# howard/objects/variants.py — Variants object (variant container backed by DuckDB/SQLite)
1import csv 2import gc 3import gzip 4import io 5import multiprocessing 6import os 7import random 8import re 9import shlex 10import sqlite3 11import subprocess 12from tempfile import NamedTemporaryFile, TemporaryDirectory 13import tempfile 14import duckdb 15import json 16import yaml 17import argparse 18import Bio.bgzf as bgzf 19import pandas as pd 20from pyfaidx import Fasta 21import numpy as np 22import vcf 23import logging as log 24import fastparquet as fp 25from multiprocesspandas import applyparallel 26 27from howard.functions.commons import * 28from howard.objects.database import * 29from howard.functions.databases import * 30from howard.functions.utils import * 31 32 33class Variants: 34 35 def __init__( 36 self, 37 conn=None, 38 input: str = None, 39 output: str = None, 40 config: dict = {}, 41 param: dict = {}, 42 load: bool = False, 43 ) -> None: 44 """ 45 The function `__init__` initializes the variables, sets the input, output, config, param, connexion and 46 header 47 48 :param conn: the connection to the database 49 :param input: the input file 50 :param output: the output file 51 :param config: a dictionary containing the configuration of the model 52 :param param: a dictionary containing the parameters of the model 53 """ 54 55 # Init variables 56 self.init_variables() 57 58 # Input 59 self.set_input(input) 60 61 # Config 62 self.set_config(config) 63 64 # Param 65 self.set_param(param) 66 67 # Output 68 self.set_output(output) 69 70 # connexion 71 self.set_connexion(conn) 72 73 # Header 74 self.set_header() 75 76 # Samples 77 self.set_samples() 78 79 # Load data 80 if load: 81 self.load_data() 82 83 def set_samples(self, samples: list = None) -> list: 84 """ 85 The function `set_samples` sets the samples attribute of an object to a provided list or 86 retrieves it from a parameter dictionary. 
87 88 :param samples: The `set_samples` method is a method of a class that takes a list of samples as 89 input and sets the `samples` attribute of the class to the provided list. If no samples are 90 provided, it tries to get the samples from the class's parameters using the `get_param` method 91 :type samples: list 92 :return: The `samples` list is being returned. 93 """ 94 95 if not samples: 96 samples = self.get_param().get("samples", {}).get("list", None) 97 98 self.samples = samples 99 100 return samples 101 102 def get_samples(self) -> list: 103 """ 104 This function returns a list of samples. 105 :return: The `get_samples` method is returning the `samples` attribute of the object. 106 """ 107 108 return self.samples 109 110 def get_samples_check(self) -> bool: 111 """ 112 This function returns the value of the "check" key within the "samples" dictionary retrieved 113 from the parameters. 114 :return: The method `get_samples_check` is returning the value of the key "check" inside the 115 "samples" dictionary, which is nested inside the dictionary returned by the `get_param()` 116 method. If the key "check" is not found, it will return `False`. 117 """ 118 119 return self.get_param().get("samples", {}).get("check", True) 120 121 def set_input(self, input: str = None) -> None: 122 """ 123 The function `set_input` takes a file name as input, extracts the name and extension, and sets 124 attributes in the class accordingly. 125 126 :param input: The `set_input` method in the provided code snippet is used to set attributes 127 related to the input file. 
Here's a breakdown of the parameters and their usage in the method: 128 :type input: str 129 """ 130 131 if input and not isinstance(input, str): 132 try: 133 self.input = input.name 134 except: 135 log.error(f"Input file '{input} in bad format") 136 raise ValueError(f"Input file '{input} in bad format") 137 else: 138 self.input = input 139 140 # Input format 141 if input: 142 input_name, input_extension = os.path.splitext(self.input) 143 self.input_name = input_name 144 self.input_extension = input_extension 145 self.input_format = self.input_extension.replace(".", "") 146 147 def set_config(self, config: dict) -> None: 148 """ 149 The set_config function takes a config object and assigns it as the configuration object for the 150 class. 151 152 :param config: The `config` parameter in the `set_config` function is a dictionary object that 153 contains configuration settings for the class. When you call the `set_config` function with a 154 dictionary object as the argument, it will set that dictionary as the configuration object for 155 the class 156 :type config: dict 157 """ 158 159 self.config = config 160 161 def set_param(self, param: dict) -> None: 162 """ 163 This function sets a parameter object for the class based on the input dictionary. 
164 165 :param param: The `set_param` method you provided takes a dictionary object as input and sets it 166 as the `param` attribute of the class instance 167 :type param: dict 168 """ 169 170 self.param = param 171 172 def init_variables(self) -> None: 173 """ 174 This function initializes the variables that will be used in the rest of the class 175 """ 176 177 self.prefix = "howard" 178 self.table_variants = "variants" 179 self.dataframe = None 180 181 self.comparison_map = { 182 "gt": ">", 183 "gte": ">=", 184 "lt": "<", 185 "lte": "<=", 186 "equals": "=", 187 "contains": "SIMILAR TO", 188 } 189 190 self.code_type_map = {"Integer": 0, "String": 1, "Float": 2, "Flag": 3} 191 192 self.code_type_map_to_sql = { 193 "Integer": "INTEGER", 194 "String": "VARCHAR", 195 "Float": "FLOAT", 196 "Flag": "VARCHAR", 197 } 198 199 self.index_additionnal_fields = [] 200 201 def get_indexing(self) -> bool: 202 """ 203 It returns the value of the key "indexing" in the dictionary. If the key is not present, it 204 returns False. 205 :return: The value of the indexing parameter. 206 """ 207 208 return self.get_param().get("indexing", False) 209 210 def get_connexion_config(self) -> dict: 211 """ 212 The function `get_connexion_config` returns a dictionary containing the configuration for a 213 connection, including the number of threads and memory limit. 214 :return: a dictionary containing the configuration for the Connexion library. 
215 """ 216 217 # config 218 config = self.get_config() 219 220 # Connexion config 221 connexion_config = {} 222 threads = self.get_threads() 223 224 # Threads 225 if threads: 226 connexion_config["threads"] = threads 227 228 # Memory 229 # if config.get("memory", None): 230 # connexion_config["memory_limit"] = config.get("memory") 231 if self.get_memory(): 232 connexion_config["memory_limit"] = self.get_memory() 233 234 # Temporary directory 235 if config.get("tmp", None): 236 connexion_config["temp_directory"] = config.get("tmp") 237 238 # Access 239 if config.get("access", None): 240 access = config.get("access") 241 if access in ["RO"]: 242 access = "READ_ONLY" 243 elif access in ["RW"]: 244 access = "READ_WRITE" 245 connexion_db = self.get_connexion_db() 246 if connexion_db in ":memory:": 247 access = "READ_WRITE" 248 connexion_config["access_mode"] = access 249 250 return connexion_config 251 252 def get_duckdb_settings(self) -> dict: 253 """ 254 The function `get_duckdb_settings` retrieves DuckDB settings from a configuration file or a 255 string. 256 :return: The function `get_duckdb_settings` returns a dictionary object `duckdb_settings_dict`. 257 """ 258 259 # config 260 config = self.get_config() 261 262 # duckdb settings 263 duckdb_settings_dict = {} 264 if config.get("duckdb_settings", None): 265 duckdb_settings = config.get("duckdb_settings") 266 duckdb_settings = full_path(duckdb_settings) 267 # duckdb setting is a file 268 if os.path.exists(duckdb_settings): 269 with open(duckdb_settings) as json_file: 270 duckdb_settings_dict = yaml.safe_load(json_file) 271 # duckdb settings is a string 272 else: 273 duckdb_settings_dict = json.loads(duckdb_settings) 274 275 return duckdb_settings_dict 276 277 def set_connexion_db(self) -> str: 278 """ 279 The function `set_connexion_db` returns the appropriate database connection string based on the 280 input format and connection type. 281 :return: the value of the variable `connexion_db`. 
282 """ 283 284 # Default connexion db 285 default_connexion_db = ":memory:" 286 287 # Find connexion db 288 if self.get_input_format() in ["db", "duckdb"]: 289 connexion_db = self.get_input() 290 elif self.get_connexion_type() in ["memory", default_connexion_db, None]: 291 connexion_db = default_connexion_db 292 elif self.get_connexion_type() in ["tmpfile"]: 293 tmp_name = tempfile.mkdtemp( 294 prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".db" 295 ) 296 connexion_db = f"{tmp_name}/tmp.db" 297 elif self.get_connexion_type() != "": 298 connexion_db = self.get_connexion_type() 299 else: 300 connexion_db = default_connexion_db 301 302 # Set connexion db 303 self.connexion_db = connexion_db 304 305 return connexion_db 306 307 def set_connexion(self, conn) -> None: 308 """ 309 The function `set_connexion` creates a connection to a database, with options for different 310 database formats and settings. 311 312 :param conn: The `conn` parameter in the `set_connexion` method is the connection to the 313 database. If a connection is not provided, a new connection to an in-memory database is created. 
314 The method then proceeds to set up the connection based on the specified format (e.g., duckdb or 315 sqlite 316 """ 317 318 # Connexion db 319 connexion_db = self.set_connexion_db() 320 321 # Connexion config 322 connexion_config = self.get_connexion_config() 323 324 # Connexion format 325 connexion_format = self.get_config().get("connexion_format", "duckdb") 326 # Set connexion format 327 self.connexion_format = connexion_format 328 329 # Connexion 330 if not conn: 331 if connexion_format in ["duckdb"]: 332 conn = duckdb.connect(connexion_db, config=connexion_config) 333 # duckDB settings 334 duckdb_settings = self.get_duckdb_settings() 335 if duckdb_settings: 336 for setting in duckdb_settings: 337 setting_value = duckdb_settings.get(setting) 338 if isinstance(setting_value, str): 339 setting_value = f"'{setting_value}'" 340 conn.execute(f"PRAGMA {setting}={setting_value};") 341 elif connexion_format in ["sqlite"]: 342 conn = sqlite3.connect(connexion_db) 343 344 # Set connexion 345 self.conn = conn 346 347 # Log 348 log.debug(f"connexion_format: {connexion_format}") 349 log.debug(f"connexion_db: {connexion_db}") 350 log.debug(f"connexion config: {connexion_config}") 351 log.debug(f"connexion duckdb settings: {self.get_duckdb_settings()}") 352 353 def set_output(self, output: str = None) -> None: 354 """ 355 The `set_output` function in Python sets the output file based on the input or a specified key 356 in the config file, extracting the output name, extension, and format. 357 358 :param output: The `output` parameter in the `set_output` method is used to specify the name of 359 the output file. If the config file has an 'output' key, the method sets the output to the value 360 of that key. 
If no output is provided, it sets the output to `None` 361 :type output: str 362 """ 363 364 if output and not isinstance(output, str): 365 self.output = output.name 366 else: 367 self.output = output 368 369 # Output format 370 if self.output: 371 output_name, output_extension = os.path.splitext(self.output) 372 self.output_name = output_name 373 self.output_extension = output_extension 374 self.output_format = self.output_extension.replace(".", "") 375 else: 376 self.output_name = None 377 self.output_extension = None 378 self.output_format = None 379 380 def set_header(self) -> None: 381 """ 382 It reads the header of a VCF file and stores it as a list of strings and as a VCF object 383 """ 384 385 input_file = self.get_input() 386 default_header_list = [ 387 "##fileformat=VCFv4.2", 388 "#CHROM POS ID REF ALT QUAL FILTER INFO", 389 ] 390 391 # Full path 392 input_file = full_path(input_file) 393 394 if input_file: 395 396 input_format = self.get_input_format() 397 input_compressed = self.get_input_compressed() 398 config = self.get_config() 399 header_list = default_header_list 400 if input_format in [ 401 "vcf", 402 "hdr", 403 "tsv", 404 "csv", 405 "psv", 406 "parquet", 407 "db", 408 "duckdb", 409 ]: 410 # header provided in param 411 if config.get("header_file", None): 412 with open(config.get("header_file"), "rt") as f: 413 header_list = self.read_vcf_header(f) 414 # within a vcf file format (header within input file itsself) 415 elif input_format in ["vcf", "hdr"] and not os.path.isdir(input_file): 416 # within a compressed vcf file format (.vcf.gz) 417 if input_compressed: 418 with bgzf.open(input_file, "rt") as f: 419 header_list = self.read_vcf_header(f) 420 # within an uncompressed vcf file format (.vcf) 421 else: 422 with open(input_file, "rt") as f: 423 header_list = self.read_vcf_header(f) 424 # header provided in default external file .hdr 425 elif os.path.exists((input_file + ".hdr")): 426 with open(input_file + ".hdr", "rt") as f: 427 header_list = 
self.read_vcf_header(f) 428 else: 429 try: # Try to get header info fields and file columns 430 431 with tempfile.TemporaryDirectory() as tmpdir: 432 433 # Create database 434 db_for_header = Database(database=input_file) 435 436 # Get header columns for infos fields 437 db_header_from_columns = ( 438 db_for_header.get_header_from_columns() 439 ) 440 441 # Get real columns in the file 442 db_header_columns = db_for_header.get_columns() 443 444 # Write header file 445 header_file_tmp = os.path.join(tmpdir, "header") 446 f = open(header_file_tmp, "w") 447 vcf.Writer(f, db_header_from_columns) 448 f.close() 449 450 # Replace #CHROM line with rel columns 451 header_list = db_for_header.read_header_file( 452 header_file=header_file_tmp 453 ) 454 header_list[-1] = "\t".join(db_header_columns) 455 456 except: 457 458 log.warning( 459 f"No header for file {input_file}. Set as default VCF header" 460 ) 461 header_list = default_header_list 462 463 else: # try for unknown format ? 464 465 log.error(f"Input file format '{input_format}' not available") 466 raise ValueError(f"Input file format '{input_format}' not available") 467 468 if not header_list: 469 header_list = default_header_list 470 471 # header as list 472 self.header_list = header_list 473 474 # header as VCF object 475 self.header_vcf = vcf.Reader(io.StringIO("\n".join(header_list))) 476 477 else: 478 479 self.header_list = None 480 self.header_vcf = None 481 482 def get_query_to_df(self, query: str = "", limit: int = None) -> pd.DataFrame: 483 """ 484 The `get_query_to_df` function takes a query as a string and returns the result as a pandas 485 DataFrame based on the connection format. 486 487 :param query: The `query` parameter in the `get_query_to_df` function is a string that 488 represents the SQL query you want to execute. 
This query will be used to fetch data from a 489 database and convert it into a pandas DataFrame 490 :type query: str 491 :param limit: The `limit` parameter in the `get_query_to_df` function is used to specify the 492 maximum number of rows to be returned in the resulting dataframe. If a limit is provided, the 493 function will only fetch up to that number of rows from the database query result. If no limit 494 is specified, 495 :type limit: int 496 :return: A pandas DataFrame is being returned by the `get_query_to_df` function. 497 """ 498 499 # Connexion format 500 connexion_format = self.get_connexion_format() 501 502 # Limit in query 503 if limit: 504 pd.set_option("display.max_rows", limit) 505 if connexion_format in ["duckdb"]: 506 df = ( 507 self.conn.execute(query) 508 .fetch_record_batch(limit) 509 .read_next_batch() 510 .to_pandas() 511 ) 512 elif connexion_format in ["sqlite"]: 513 df = next(pd.read_sql_query(query, self.conn, chunksize=limit)) 514 515 # Full query 516 else: 517 if connexion_format in ["duckdb"]: 518 df = self.conn.execute(query).df() 519 elif connexion_format in ["sqlite"]: 520 df = pd.read_sql_query(query, self.conn) 521 522 return df 523 524 def get_overview(self) -> None: 525 """ 526 The function prints the input, output, config, and dataframe of the current object 527 """ 528 table_variants_from = self.get_table_variants(clause="from") 529 sql_columns = self.get_header_columns_as_sql() 530 sql_query_export = f"SELECT {sql_columns} FROM {table_variants_from}" 531 df = self.get_query_to_df(sql_query_export) 532 log.info( 533 "Input: " 534 + str(self.get_input()) 535 + " [" 536 + str(str(self.get_input_format())) 537 + "]" 538 ) 539 log.info( 540 "Output: " 541 + str(self.get_output()) 542 + " [" 543 + str(str(self.get_output_format())) 544 + "]" 545 ) 546 log.info("Config: ") 547 for d in str(json.dumps(self.get_config(), indent=4, sort_keys=True)).split( 548 "\n" 549 ): 550 log.info("\t" + str(d)) 551 log.info("Param: ") 552 for d 
in str(json.dumps(self.get_param(), indent=4, sort_keys=True)).split( 553 "\n" 554 ): 555 log.info("\t" + str(d)) 556 log.info("Sample list: " + str(self.get_header_sample_list())) 557 log.info("Dataframe: ") 558 for d in str(df).split("\n"): 559 log.info("\t" + str(d)) 560 561 # garbage collector 562 del df 563 gc.collect() 564 565 return None 566 567 def get_stats(self) -> dict: 568 """ 569 The `get_stats` function calculates and returns various statistics of the current object, 570 including information about the input file, variants, samples, header fields, quality, and 571 SNVs/InDels. 572 :return: a dictionary containing various statistics of the current object. The dictionary has 573 the following structure: 574 """ 575 576 # Log 577 log.info(f"Stats Calculation...") 578 579 # table varaints 580 table_variants_from = self.get_table_variants() 581 582 # stats dict 583 stats = {"Infos": {}} 584 585 ### File 586 input_file = self.get_input() 587 stats["Infos"]["Input file"] = input_file 588 589 # Header 590 header_infos = self.get_header().infos 591 header_formats = self.get_header().formats 592 header_infos_list = list(header_infos) 593 header_formats_list = list(header_formats) 594 595 ### Variants 596 597 stats["Variants"] = {} 598 599 # Variants by chr 600 sql_query_nb_variant_by_chrom = f'SELECT "#CHROM" as CHROM, count(*) as count FROM {table_variants_from} GROUP BY "#CHROM"' 601 df_nb_of_variants_by_chrom = self.get_query_to_df(sql_query_nb_variant_by_chrom) 602 nb_of_variants_by_chrom = df_nb_of_variants_by_chrom.sort_values( 603 by=["CHROM"], kind="quicksort" 604 ) 605 606 # Total number of variants 607 nb_of_variants = nb_of_variants_by_chrom["count"].sum() 608 609 # Calculate percentage 610 nb_of_variants_by_chrom["percent"] = nb_of_variants_by_chrom["count"].apply( 611 lambda x: (x / nb_of_variants) 612 ) 613 614 stats["Variants"]["Number of variants by chromosome"] = ( 615 nb_of_variants_by_chrom.to_dict(orient="index") 616 ) 617 618 
stats["Infos"]["Number of variants"] = int(nb_of_variants) 619 620 ### Samples 621 622 # Init 623 samples = {} 624 nb_of_samples = 0 625 626 # Check Samples 627 if "GT" in header_formats_list and "FORMAT" in self.get_header_columns(): 628 log.debug(f"Check samples...") 629 for sample in self.get_header_sample_list(): 630 sql_query_samples = f""" 631 SELECT '{sample}' as sample, 632 REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1) as genotype, 633 count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1)) as count, 634 concat((count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1))/{nb_of_variants})) as percentage 635 FROM {table_variants_from} 636 WHERE ( 637 regexp_matches("{sample}", '^[0-9]([/|][0-9])+') 638 AND 639 len(string_split(CAST("FORMAT" AS VARCHAR), ':')) = len(string_split(CAST("{sample}" AS VARCHAR), ':')) 640 ) 641 GROUP BY genotype 642 """ 643 sql_query_genotype_df = self.conn.execute(sql_query_samples).df() 644 sample_genotype_count = sql_query_genotype_df["count"].sum() 645 if len(sql_query_genotype_df): 646 nb_of_samples += 1 647 samples[f"{sample} - {sample_genotype_count} variants"] = ( 648 sql_query_genotype_df.to_dict(orient="index") 649 ) 650 651 stats["Samples"] = samples 652 stats["Infos"]["Number of samples"] = nb_of_samples 653 654 # # 655 # if "FORMAT" in self.get_header_columns() and "DP" in header_formats_list: 656 # stats["Infos"]["Number of samples"] = nb_of_samples 657 # elif nb_of_samples: 658 # stats["Infos"]["Number of samples"] = "not a VCF format" 659 660 ### INFO and FORMAT fields 661 header_types_df = {} 662 header_types_list = { 663 "List of INFO fields": header_infos, 664 "List of FORMAT fields": header_formats, 665 } 666 i = 0 667 for header_type in header_types_list: 668 669 header_type_infos = header_types_list.get(header_type) 670 header_infos_dict = {} 671 672 for info in header_type_infos: 673 674 i += 1 675 header_infos_dict[i] = {} 676 677 # ID 678 header_infos_dict[i]["id"] = info 679 680 # num 681 genotype_map = 
{None: ".", -1: "A", -2: "G", -3: "R"} 682 if header_type_infos[info].num in genotype_map.keys(): 683 header_infos_dict[i]["Number"] = genotype_map.get( 684 header_type_infos[info].num 685 ) 686 else: 687 header_infos_dict[i]["Number"] = header_type_infos[info].num 688 689 # type 690 if header_type_infos[info].type: 691 header_infos_dict[i]["Type"] = header_type_infos[info].type 692 else: 693 header_infos_dict[i]["Type"] = "." 694 695 # desc 696 if header_type_infos[info].desc != None: 697 header_infos_dict[i]["Description"] = header_type_infos[info].desc 698 else: 699 header_infos_dict[i]["Description"] = "" 700 701 if len(header_infos_dict): 702 header_types_df[header_type] = pd.DataFrame.from_dict( 703 header_infos_dict, orient="index" 704 ).to_dict(orient="index") 705 706 # Stats 707 stats["Infos"]["Number of INFO fields"] = len(header_infos_list) 708 stats["Infos"]["Number of FORMAT fields"] = len(header_formats_list) 709 stats["Header"] = header_types_df 710 711 ### QUAL 712 if "QUAL" in self.get_header_columns(): 713 sql_query_qual = f""" 714 SELECT 715 avg(CAST(QUAL AS INTEGER)) AS Average, 716 min(CAST(QUAL AS INTEGER)) AS Minimum, 717 max(CAST(QUAL AS INTEGER)) AS Maximum, 718 stddev(CAST(QUAL AS INTEGER)) AS StandardDeviation, 719 median(CAST(QUAL AS INTEGER)) AS Median, 720 variance(CAST(QUAL AS INTEGER)) AS Variance 721 FROM {table_variants_from} 722 WHERE CAST(QUAL AS VARCHAR) NOT IN ('.') 723 """ 724 725 qual = self.conn.execute(sql_query_qual).df().to_dict(orient="index") 726 stats["Quality"] = {"Stats": qual} 727 728 ### SNV and InDel 729 730 sql_query_snv = f""" 731 732 SELECT Type, count FROM ( 733 734 SELECT 735 'Total' AS Type, 736 count(*) AS count 737 FROM {table_variants_from} 738 739 UNION 740 741 SELECT 742 'MNV' AS Type, 743 count(*) AS count 744 FROM {table_variants_from} 745 WHERE len(REF) > 1 AND len(ALT) > 1 746 AND len(REF) = len(ALT) 747 748 UNION 749 750 SELECT 751 'InDel' AS Type, 752 count(*) AS count 753 FROM 
{table_variants_from} 754 WHERE len(REF) > 1 OR len(ALT) > 1 755 AND len(REF) != len(ALT) 756 757 UNION 758 759 SELECT 760 'SNV' AS Type, 761 count(*) AS count 762 FROM {table_variants_from} 763 WHERE len(REF) = 1 AND len(ALT) = 1 764 765 ) 766 767 ORDER BY count DESC 768 769 """ 770 snv_indel = self.conn.execute(sql_query_snv).df().to_dict(orient="index") 771 772 sql_query_snv_substitution = f""" 773 SELECT 774 concat(REF, '>', ALT) AS 'Substitution', 775 count(*) AS count 776 FROM {table_variants_from} 777 WHERE len(REF) = 1 AND len(ALT) = 1 778 GROUP BY REF, ALT 779 ORDER BY count(*) DESC 780 """ 781 snv_substitution = ( 782 self.conn.execute(sql_query_snv_substitution).df().to_dict(orient="index") 783 ) 784 stats["Variants"]["Counts"] = snv_indel 785 stats["Variants"]["Substitutions"] = snv_substitution 786 787 return stats 788 789 def stats_to_file(self, file: str = None) -> str: 790 """ 791 The function `stats_to_file` takes a file name as input, retrieves statistics, serializes them 792 into a JSON object, and writes the JSON object to the specified file. 793 794 :param file: The `file` parameter is a string that represents the file path where the JSON data 795 will be written 796 :type file: str 797 :return: the name of the file that was written to. 798 """ 799 800 # Get stats 801 stats = self.get_stats() 802 803 # Serializing json 804 json_object = json.dumps(stats, indent=4) 805 806 # Writing to sample.json 807 with open(file, "w") as outfile: 808 outfile.write(json_object) 809 810 return file 811 812 def print_stats(self, output_file: str = None, json_file: str = None) -> None: 813 """ 814 The `print_stats` function generates a markdown file and prints the statistics contained in a 815 JSON file in a formatted manner. 816 817 :param output_file: The `output_file` parameter is a string that specifies the path and filename 818 of the output file where the stats will be printed in Markdown format. 
If no `output_file` is 819 provided, a temporary directory will be created and the stats will be saved in a file named 820 "stats.md" within that 821 :type output_file: str 822 :param json_file: The `json_file` parameter is a string that represents the path to the JSON 823 file where the statistics will be saved. If no value is provided, a temporary directory will be 824 created and a default file name "stats.json" will be used 825 :type json_file: str 826 :return: The function `print_stats` does not return any value. It has a return type annotation 827 of `None`. 828 """ 829 830 # Full path 831 output_file = full_path(output_file) 832 json_file = full_path(json_file) 833 834 with tempfile.TemporaryDirectory() as tmpdir: 835 836 # Files 837 if not output_file: 838 output_file = os.path.join(tmpdir, "stats.md") 839 if not json_file: 840 json_file = os.path.join(tmpdir, "stats.json") 841 842 # Create folders 843 if not os.path.exists(os.path.dirname(output_file)): 844 Path(os.path.dirname(output_file)).mkdir(parents=True, exist_ok=True) 845 if not os.path.exists(os.path.dirname(json_file)): 846 Path(os.path.dirname(json_file)).mkdir(parents=True, exist_ok=True) 847 848 # Create stats JSON file 849 stats_file = self.stats_to_file(file=json_file) 850 851 # Print stats file 852 with open(stats_file) as f: 853 stats = yaml.safe_load(f) 854 855 # Output 856 output_title = [] 857 output_index = [] 858 output = [] 859 860 # Title 861 output_title.append("# HOWARD Stats") 862 863 # Index 864 output_index.append("## Index") 865 866 # Process sections 867 for section in stats: 868 infos = stats.get(section) 869 section_link = "#" + section.lower().replace(" ", "-") 870 output.append(f"## {section}") 871 output_index.append(f"- [{section}]({section_link})") 872 873 if len(infos): 874 for info in infos: 875 try: 876 df = pd.DataFrame.from_dict(infos.get(info), orient="index") 877 is_df = True 878 except: 879 try: 880 df = pd.DataFrame.from_dict( 881 
json.loads((infos.get(info))), orient="index" 882 ) 883 is_df = True 884 except: 885 is_df = False 886 if is_df: 887 output.append(f"### {info}") 888 info_link = "#" + info.lower().replace(" ", "-") 889 output_index.append(f" - [{info}]({info_link})") 890 output.append(f"{df.to_markdown(index=False)}") 891 else: 892 output.append(f"- {info}: {infos.get(info)}") 893 else: 894 output.append(f"NA") 895 896 # Write stats in markdown file 897 with open(output_file, "w") as fp: 898 for item in output_title: 899 fp.write("%s\n" % item) 900 for item in output_index: 901 fp.write("%s\n" % item) 902 for item in output: 903 fp.write("%s\n" % item) 904 905 # Output stats in markdown 906 print("") 907 print("\n\n".join(output_title)) 908 print("") 909 print("\n\n".join(output)) 910 print("") 911 912 return None 913 914 def get_input(self) -> str: 915 """ 916 It returns the value of the input variable. 917 :return: The input is being returned. 918 """ 919 return self.input 920 921 def get_input_format(self, input_file: str = None) -> str: 922 """ 923 This function returns the format of the input variable, either from the provided input file or 924 by prompting for input. 925 926 :param input_file: The `input_file` parameter in the `get_input_format` method is a string that 927 represents the file path of the input file. If no `input_file` is provided when calling the 928 method, it will default to `None` 929 :type input_file: str 930 :return: The format of the input variable is being returned. 931 """ 932 933 if not input_file: 934 input_file = self.get_input() 935 input_format = get_file_format(input_file) 936 return input_format 937 938 def get_input_compressed(self, input_file: str = None) -> str: 939 """ 940 The function `get_input_compressed` returns the format of the input variable after compressing 941 it. 942 943 :param input_file: The `input_file` parameter in the `get_input_compressed` method is a string 944 that represents the file path of the input file. 
If no `input_file` is provided when calling the 945 method, it will default to `None` and the method will then call `self.get_input()` to 946 :type input_file: str 947 :return: The function `get_input_compressed` returns the compressed format of the input 948 variable. 949 """ 950 951 if not input_file: 952 input_file = self.get_input() 953 input_compressed = get_file_compressed(input_file) 954 return input_compressed 955 956 def get_output(self) -> str: 957 """ 958 It returns the output of the neuron. 959 :return: The output of the neural network. 960 """ 961 962 return self.output 963 964 def get_output_format(self, output_file: str = None) -> str: 965 """ 966 The function `get_output_format` returns the format of the input variable or the output file if 967 provided. 968 969 :param output_file: The `output_file` parameter in the `get_output_format` method is a string 970 that represents the file path of the output file. If no `output_file` is provided when calling 971 the method, it will default to the output obtained from the `get_output` method of the class 972 instance. The 973 :type output_file: str 974 :return: The format of the input variable is being returned. 975 """ 976 977 if not output_file: 978 output_file = self.get_output() 979 output_format = get_file_format(output_file) 980 981 return output_format 982 983 def get_config(self) -> dict: 984 """ 985 It returns the config 986 :return: The config variable is being returned. 987 """ 988 return self.config 989 990 def get_param(self) -> dict: 991 """ 992 It returns the param 993 :return: The param variable is being returned. 994 """ 995 return self.param 996 997 def get_connexion_db(self) -> str: 998 """ 999 It returns the connexion_db attribute of the object 1000 :return: The connexion_db is being returned. 1001 """ 1002 return self.connexion_db 1003 1004 def get_prefix(self) -> str: 1005 """ 1006 It returns the prefix of the object. 1007 :return: The prefix is being returned. 
1008 """ 1009 return self.prefix 1010 1011 def get_table_variants(self, clause: str = "select") -> str: 1012 """ 1013 This function returns the table_variants attribute of the object 1014 1015 :param clause: the type of clause the table will be used. Either "select" or "from" (optional), 1016 defaults to select (optional) 1017 :return: The table_variants attribute of the object. 1018 """ 1019 1020 # Access 1021 access = self.get_config().get("access", None) 1022 1023 # Clauses "select", "where", "update" 1024 if clause in ["select", "where", "update"]: 1025 table_variants = self.table_variants 1026 # Clause "from" 1027 elif clause in ["from"]: 1028 # For Read Only 1029 if self.get_input_format() in ["parquet"] and access in ["RO"]: 1030 input_file = self.get_input() 1031 table_variants = f"'{input_file}' as variants" 1032 # For Read Write 1033 else: 1034 table_variants = f"{self.table_variants} as variants" 1035 else: 1036 table_variants = self.table_variants 1037 return table_variants 1038 1039 def get_tmp_dir(self) -> str: 1040 """ 1041 The function `get_tmp_dir` returns the temporary directory path based on configuration 1042 parameters or a default path. 1043 :return: The `get_tmp_dir` method is returning the temporary directory path based on the 1044 configuration, parameters, and a default value of "/tmp". 1045 """ 1046 1047 return get_tmp( 1048 config=self.get_config(), param=self.get_param(), default_tmp="/tmp" 1049 ) 1050 1051 def get_connexion_type(self) -> str: 1052 """ 1053 If the connexion type is not in the list of allowed connexion types, raise a ValueError 1054 1055 :return: The connexion type is being returned. 1056 """ 1057 return self.get_config().get("connexion_type", "memory") 1058 1059 def get_connexion(self): 1060 """ 1061 It returns the connection object 1062 1063 :return: The connection object. 1064 """ 1065 return self.conn 1066 1067 def close_connexion(self) -> None: 1068 """ 1069 This function closes the connection to the database. 
1070 :return: The connection is being closed. 1071 """ 1072 return self.conn.close() 1073 1074 def get_header(self, type: str = "vcf"): 1075 """ 1076 This function returns the header of the VCF file as a list of strings 1077 1078 :param type: the type of header you want to get, defaults to vcf (optional) 1079 :return: The header of the vcf file. 1080 """ 1081 1082 if self.header_vcf: 1083 if type == "vcf": 1084 return self.header_vcf 1085 elif type == "list": 1086 return self.header_list 1087 else: 1088 if type == "vcf": 1089 header = vcf.Reader(io.StringIO("\n".join(vcf_required))) 1090 return header 1091 elif type == "list": 1092 return vcf_required 1093 1094 def get_header_length(self, file: str = None) -> int: 1095 """ 1096 The function `get_header_length` returns the length of the header list, excluding the #CHROM 1097 line. 1098 1099 :param file: The `file` parameter is an optional argument that specifies the path to a VCF 1100 header file. If this argument is provided, the function will read the header from the specified 1101 file and return the length of the header list minus 1 (to exclude the #CHROM line) 1102 :type file: str 1103 :return: the length of the header list, excluding the #CHROM line. 1104 """ 1105 1106 if file: 1107 return len(self.read_vcf_header_file(file=file)) - 1 1108 elif self.get_header(type="list"): 1109 return len(self.get_header(type="list")) - 1 1110 else: 1111 return 0 1112 1113 def get_header_columns(self) -> str: 1114 """ 1115 This function returns the header list of a VCF 1116 1117 :return: The length of the header list. 1118 """ 1119 if self.get_header(): 1120 return self.get_header(type="list")[-1] 1121 else: 1122 return "" 1123 1124 def get_header_columns_as_list(self) -> list: 1125 """ 1126 This function returns the header list of a VCF 1127 1128 :return: The length of the header list. 
1129 """ 1130 if self.get_header(): 1131 return self.get_header_columns().strip().split("\t") 1132 else: 1133 return [] 1134 1135 def get_header_columns_as_sql(self) -> str: 1136 """ 1137 This function retruns header length (without #CHROM line) 1138 1139 :return: The length of the header list. 1140 """ 1141 sql_column_list = [] 1142 for col in self.get_header_columns_as_list(): 1143 sql_column_list.append(f'"{col}"') 1144 return ",".join(sql_column_list) 1145 1146 def get_header_sample_list( 1147 self, check: bool = False, samples: list = None, samples_force: bool = False 1148 ) -> list: 1149 """ 1150 The function `get_header_sample_list` returns a list of samples from a VCF header, with optional 1151 checking and filtering based on input parameters. 1152 1153 :param check: The `check` parameter in the `get_header_sample_list` function is a boolean 1154 parameter that determines whether to check if the samples in the list are properly defined as 1155 genotype columns. If `check` is set to `True`, the function will verify if each sample in the 1156 list is defined as a, defaults to False 1157 :type check: bool (optional) 1158 :param samples: The `samples` parameter in the `get_header_sample_list` function is a list that 1159 allows you to specify a subset of samples from the header. If you provide a list of sample 1160 names, the function will check if each sample is defined in the header. If a sample is not found 1161 in the 1162 :type samples: list 1163 :param samples_force: The `samples_force` parameter in the `get_header_sample_list` function is 1164 a boolean parameter that determines whether to force the function to return the sample list 1165 without checking if the samples are genotype columns. 
If `samples_force` is set to `True`, the 1166 function will return the sample list without performing, defaults to False 1167 :type samples_force: bool (optional) 1168 :return: The function `get_header_sample_list` returns a list of samples based on the input 1169 parameters and conditions specified in the function. 1170 """ 1171 1172 # Init 1173 samples_list = [] 1174 1175 if samples is None: 1176 samples_list = self.header_vcf.samples 1177 else: 1178 samples_checked = [] 1179 for sample in samples: 1180 if sample in self.header_vcf.samples: 1181 samples_checked.append(sample) 1182 else: 1183 log.warning(f"Sample '{sample}' not defined in header") 1184 samples_list = samples_checked 1185 1186 # Force sample list without checking if is_genotype_column 1187 if samples_force: 1188 log.warning(f"Samples {samples_list} not checked if genotypes") 1189 return samples_list 1190 1191 if check: 1192 samples_checked = [] 1193 for sample in samples_list: 1194 if self.is_genotype_column(column=sample): 1195 samples_checked.append(sample) 1196 else: 1197 log.warning( 1198 f"Sample '{sample}' not defined as a sample (genotype not well defined)" 1199 ) 1200 samples_list = samples_checked 1201 1202 # Return samples list 1203 return samples_list 1204 1205 def is_genotype_column(self, column: str = None) -> bool: 1206 """ 1207 This function checks if a given column is a genotype column in a database. 1208 1209 :param column: The `column` parameter in the `is_genotype_column` method is a string that 1210 represents the column name in a database table. This method checks if the specified column is a 1211 genotype column in the database. If a column name is provided, it calls the `is_genotype_column` 1212 method of 1213 :type column: str 1214 :return: The `is_genotype_column` method is returning a boolean value. If the `column` parameter 1215 is not None, it calls the `is_genotype_column` method of the `Database` class with the specified 1216 column name and returns the result. 
If the `column` parameter is None, it returns False. 1217 """ 1218 1219 if column is not None: 1220 return Database(database=self.get_input()).is_genotype_column(column=column) 1221 else: 1222 return False 1223 1224 def get_verbose(self) -> bool: 1225 """ 1226 It returns the value of the "verbose" key in the config dictionary, or False if the key doesn't 1227 exist 1228 1229 :return: The value of the key "verbose" in the config dictionary. 1230 """ 1231 return self.get_config().get("verbose", False) 1232 1233 def get_connexion_format(self) -> str: 1234 """ 1235 It returns the connexion format of the object. 1236 :return: The connexion_format is being returned. 1237 """ 1238 connexion_format = self.connexion_format 1239 if connexion_format not in ["duckdb", "sqlite"]: 1240 log.error(f"Unknown connexion format {connexion_format}") 1241 raise ValueError(f"Unknown connexion format {connexion_format}") 1242 else: 1243 return connexion_format 1244 1245 def insert_file_to_table( 1246 self, 1247 file, 1248 columns: str, 1249 header_len: int = 0, 1250 sep: str = "\t", 1251 chunksize: int = 1000000, 1252 ) -> None: 1253 """ 1254 The function reads a file in chunks and inserts each chunk into a table based on the specified 1255 database format. 1256 1257 :param file: The `file` parameter is the file that you want to load into a table. It should be 1258 the path to the file on your system 1259 :param columns: The `columns` parameter in the `insert_file_to_table` function is a string that 1260 should contain the names of the columns in the table where the data will be inserted. The column 1261 names should be separated by commas within the string. For example, if you have columns named 1262 "id", "name 1263 :type columns: str 1264 :param header_len: The `header_len` parameter in the `insert_file_to_table` function specifies 1265 the number of lines to skip at the beginning of the file before reading the actual data. 
This 1266 parameter allows you to skip any header information present in the file before processing the 1267 data, defaults to 0 1268 :type header_len: int (optional) 1269 :param sep: The `sep` parameter in the `insert_file_to_table` function is used to specify the 1270 separator character that is used in the file being read. In this case, the default separator is 1271 set to `\t`, which represents a tab character. You can change this parameter to a different 1272 separator character if, defaults to \t 1273 :type sep: str (optional) 1274 :param chunksize: The `chunksize` parameter specifies the number of rows to read in at a time 1275 when processing the file in chunks. In the provided code snippet, the default value for 1276 `chunksize` is set to 1000000. This means that the file will be read in chunks of 1,, defaults 1277 to 1000000 1278 :type chunksize: int (optional) 1279 """ 1280 1281 # Config 1282 chunksize = self.get_config().get("load", {}).get("chunk", chunksize) 1283 connexion_format = self.get_connexion_format() 1284 1285 log.debug("chunksize: " + str(chunksize)) 1286 1287 if chunksize: 1288 for chunk in pd.read_csv( 1289 file, skiprows=header_len, sep=sep, chunksize=chunksize, engine="c" 1290 ): 1291 if connexion_format in ["duckdb"]: 1292 sql_insert_into = ( 1293 f"INSERT INTO variants ({columns}) SELECT {columns} FROM chunk" 1294 ) 1295 self.conn.execute(sql_insert_into) 1296 elif connexion_format in ["sqlite"]: 1297 chunk.to_sql("variants", self.conn, if_exists="append", index=False) 1298 1299 def load_data( 1300 self, 1301 input_file: str = None, 1302 drop_variants_table: bool = False, 1303 sample_size: int = 20480, 1304 ) -> None: 1305 """ 1306 The `load_data` function reads a VCF file and inserts it into a table, with options to drop the 1307 table before loading the data and specify a sample size. 1308 1309 :param input_file: The path to the input file. 
This is the VCF file that will be loaded into the 1310 table 1311 :type input_file: str 1312 :param drop_variants_table: The `drop_variants_table` parameter is a boolean flag that 1313 determines whether the variants table should be dropped before loading the data. If set to 1314 `True`, the variants table will be dropped. If set to `False` (default), the variants table will 1315 not be dropped, defaults to False 1316 :type drop_variants_table: bool (optional) 1317 :param sample_size: The `sample_size` parameter determines the number of rows to be sampled from 1318 the input file. If it is set to `None`, the default value of 20480 will be used, defaults to 1319 20480 1320 :type sample_size: int (optional) 1321 """ 1322 1323 log.info("Loading...") 1324 1325 # change input file 1326 if input_file: 1327 self.set_input(input_file) 1328 self.set_header() 1329 1330 # drop variants table 1331 if drop_variants_table: 1332 self.drop_variants_table() 1333 1334 # get table variants 1335 table_variants = self.get_table_variants() 1336 1337 # Access 1338 access = self.get_config().get("access", None) 1339 log.debug(f"access: {access}") 1340 1341 # Input format and compress 1342 input_format = self.get_input_format() 1343 input_compressed = self.get_input_compressed() 1344 log.debug(f"input_format: {input_format}") 1345 log.debug(f"input_compressed: {input_compressed}") 1346 1347 # input_compressed_format 1348 if input_compressed: 1349 input_compressed_format = "gzip" 1350 else: 1351 input_compressed_format = "none" 1352 log.debug(f"input_compressed_format: {input_compressed_format}") 1353 1354 # Connexion format 1355 connexion_format = self.get_connexion_format() 1356 1357 # Sample size 1358 if not sample_size: 1359 sample_size = -1 1360 log.debug(f"sample_size: {sample_size}") 1361 1362 # Load data 1363 log.debug(f"Load Data from {input_format}") 1364 1365 # DuckDB connexion 1366 if connexion_format in ["duckdb"]: 1367 1368 # Database already exists 1369 if self.input_format 
in ["db", "duckdb"]: 1370 1371 if connexion_format in ["duckdb"]: 1372 log.debug(f"Input file format '{self.input_format}' duckDB") 1373 else: 1374 log.error( 1375 f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'" 1376 ) 1377 raise ValueError( 1378 f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'" 1379 ) 1380 1381 # Load from existing database format 1382 else: 1383 1384 try: 1385 # Create Table or View 1386 database = Database(database=self.input) 1387 sql_from = database.get_sql_from(sample_size=sample_size) 1388 1389 if access in ["RO"]: 1390 sql_load = ( 1391 f"CREATE VIEW {table_variants} AS SELECT * FROM {sql_from}" 1392 ) 1393 else: 1394 sql_load = ( 1395 f"CREATE TABLE {table_variants} AS SELECT * FROM {sql_from}" 1396 ) 1397 self.conn.execute(sql_load) 1398 1399 except: 1400 # Format not available 1401 log.error(f"Input file format '{self.input_format}' not available") 1402 raise ValueError( 1403 f"Input file format '{self.input_format}' not available" 1404 ) 1405 1406 # SQLite connexion 1407 elif connexion_format in ["sqlite"] and input_format in [ 1408 "vcf", 1409 "tsv", 1410 "csv", 1411 "psv", 1412 ]: 1413 1414 # Main structure 1415 structure = { 1416 "#CHROM": "VARCHAR", 1417 "POS": "INTEGER", 1418 "ID": "VARCHAR", 1419 "REF": "VARCHAR", 1420 "ALT": "VARCHAR", 1421 "QUAL": "VARCHAR", 1422 "FILTER": "VARCHAR", 1423 "INFO": "VARCHAR", 1424 } 1425 1426 # Strcuture with samples 1427 structure_complete = structure 1428 if self.get_header_sample_list(): 1429 structure["FORMAT"] = "VARCHAR" 1430 for sample in self.get_header_sample_list(): 1431 structure_complete[sample] = "VARCHAR" 1432 1433 # Columns list for create and insert 1434 sql_create_table_columns = [] 1435 sql_create_table_columns_list = [] 1436 for column in structure_complete: 1437 column_type = structure_complete[column] 1438 sql_create_table_columns.append( 1439 f'"{column}" {column_type} 
default NULL' 1440 ) 1441 sql_create_table_columns_list.append(f'"{column}"') 1442 1443 # Create database 1444 log.debug(f"Create Table {table_variants}") 1445 sql_create_table_columns_sql = ", ".join(sql_create_table_columns) 1446 sql_create_table_columns_list_sql = ", ".join(sql_create_table_columns_list) 1447 sql_create_table = f"CREATE TABLE IF NOT EXISTS {table_variants} ({sql_create_table_columns_sql})" 1448 self.conn.execute(sql_create_table) 1449 1450 # chunksize define length of file chunk load file 1451 chunksize = 100000 1452 1453 # delimiter 1454 delimiter = file_format_delimiters.get(input_format, "\t") 1455 1456 # Load the input file 1457 with open(self.input, "rt") as input_file: 1458 1459 # Use the appropriate file handler based on the input format 1460 if input_compressed: 1461 input_file = bgzf.open(self.input, "rt") 1462 if input_format in ["vcf"]: 1463 header_len = self.get_header_length() 1464 else: 1465 header_len = 0 1466 1467 # Insert the file contents into a table 1468 self.insert_file_to_table( 1469 input_file, 1470 columns=sql_create_table_columns_list_sql, 1471 header_len=header_len, 1472 sep=delimiter, 1473 chunksize=chunksize, 1474 ) 1475 1476 else: 1477 log.error( 1478 f"Connexion format '{connexion_format}' not available with format '{input_format}'" 1479 ) 1480 raise ValueError( 1481 f"Connexion format '{connexion_format}' not available with format '{input_format}'" 1482 ) 1483 1484 # Explode INFOS fields into table fields 1485 if self.get_explode_infos(): 1486 self.explode_infos( 1487 prefix=self.get_explode_infos_prefix(), 1488 fields=self.get_explode_infos_fields(), 1489 force=True, 1490 ) 1491 1492 # Create index after insertion 1493 self.create_indexes() 1494 1495 def get_explode_infos(self) -> bool: 1496 """ 1497 The function `get_explode_infos` returns the value of the "explode_infos" parameter, defaulting 1498 to False if it is not set. 
1499 :return: The method is returning the value of the "explode_infos" parameter, which is a boolean 1500 value. If the parameter is not present, it will return False. 1501 """ 1502 1503 return self.get_param().get("explode", {}).get("explode_infos", False) 1504 1505 def get_explode_infos_fields( 1506 self, 1507 explode_infos_fields: str = None, 1508 remove_fields_not_in_header: bool = False, 1509 ) -> list: 1510 """ 1511 The `get_explode_infos_fields` function returns a list of exploded information fields based on 1512 the input parameter `explode_infos_fields`. 1513 1514 :param explode_infos_fields: The `explode_infos_fields` parameter is a string that specifies the 1515 fields to be exploded. It can be set to "ALL" to explode all fields, or it can be a 1516 comma-separated list of field names to explode 1517 :type explode_infos_fields: str 1518 :param remove_fields_not_in_header: The parameter `remove_fields_not_in_header` is a boolean 1519 flag that determines whether to remove fields that are not present in the header. If it is set 1520 to `True`, any field that is not in the header will be excluded from the list of exploded 1521 information fields. If it is set to `, defaults to False 1522 :type remove_fields_not_in_header: bool (optional) 1523 :return: The function `get_explode_infos_fields` returns a list of exploded information fields. 1524 If the `explode_infos_fields` parameter is not provided or is set to None, it returns an empty 1525 list. If the parameter is provided and its value is "ALL", it also returns an empty list. 1526 Otherwise, it returns a list of exploded information fields after removing any spaces and 1527 splitting the string by commas. 
1528 """ 1529 1530 # If no fields, get it in param 1531 if not explode_infos_fields: 1532 explode_infos_fields = ( 1533 self.get_param().get("explode", {}).get("explode_infos_fields", None) 1534 ) 1535 1536 # If no fields, defined as all fields in header using keyword 1537 if not explode_infos_fields: 1538 explode_infos_fields = "*" 1539 1540 # If fields list not empty 1541 if explode_infos_fields: 1542 1543 # Input fields list 1544 if isinstance(explode_infos_fields, str): 1545 fields_input = explode_infos_fields.split(",") 1546 elif isinstance(explode_infos_fields, list): 1547 fields_input = explode_infos_fields 1548 else: 1549 fields_input = [] 1550 1551 # Fields list without * keyword 1552 fields_without_all = fields_input.copy() 1553 if "*".casefold() in (item.casefold() for item in fields_without_all): 1554 fields_without_all.remove("*") 1555 1556 # Fields in header 1557 fields_in_header = sorted(list(set(self.get_header().infos))) 1558 1559 # Construct list of fields 1560 fields_output = [] 1561 for field in fields_input: 1562 1563 # Strip field 1564 field = field.strip() 1565 1566 # format keyword * in regex 1567 if field.upper() in ["*"]: 1568 field = ".*" 1569 1570 # Find all fields with pattern 1571 r = re.compile(field) 1572 fields_search = sorted(list(filter(r.match, fields_in_header))) 1573 1574 # Remove fields input from search 1575 if field in fields_search: 1576 fields_search = [field] 1577 elif fields_search != [field]: 1578 fields_search = sorted( 1579 list(set(fields_search).difference(fields_input)) 1580 ) 1581 1582 # If field is not in header (avoid not well formatted header) 1583 if not fields_search and not remove_fields_not_in_header: 1584 fields_search = [field] 1585 1586 # Add found fields 1587 for new_field in fields_search: 1588 # Add field, if not already exists, and if it is in header (if asked) 1589 if ( 1590 new_field not in fields_output 1591 and ( 1592 not remove_fields_not_in_header 1593 or new_field in fields_in_header 1594 ) 
1595 and new_field not in [".*"] 1596 ): 1597 fields_output.append(new_field) 1598 1599 return fields_output 1600 1601 else: 1602 1603 return [] 1604 1605 def get_explode_infos_prefix(self, explode_infos_prefix: str = None) -> str: 1606 """ 1607 The function `get_explode_infos_prefix` returns the value of the `explode_infos_prefix` parameter, or 1608 the value of `self.get_param().get("explode_infos_prefix", None)` if `explode_infos_prefix` is 1609 not provided. 1610 1611 :param explode_infos_prefix: The parameter `explode_infos_prefix` is a string that specifies a 1612 prefix to be used for exploding or expanding information 1613 :type explode_infos_prefix: str 1614 :return: the value of the variable `explode_infos_prefix`. 1615 """ 1616 1617 if not explode_infos_prefix: 1618 explode_infos_prefix = ( 1619 self.get_param().get("explode", {}).get("explode_infos_prefix", "") 1620 ) 1621 1622 return explode_infos_prefix 1623 1624 def add_column( 1625 self, 1626 table_name, 1627 column_name, 1628 column_type, 1629 default_value=None, 1630 drop: bool = False, 1631 ) -> dict: 1632 """ 1633 The `add_column` function adds a column to a SQLite or DuckDB table with a default value if it 1634 doesn't already exist. 1635 1636 :param table_name: The name of the table to which you want to add a column 1637 :param column_name: The parameter "column_name" is the name of the column that you want to add 1638 to the table 1639 :param column_type: The `column_type` parameter specifies the data type of the column that you 1640 want to add to the table. It should be a string that represents the desired data type, such as 1641 "INTEGER", "TEXT", "REAL", etc 1642 :param default_value: The `default_value` parameter is an optional parameter that specifies the 1643 default value for the newly added column. 
If a default value is provided, it will be assigned to 1644 the column for any existing rows that do not have a value for that column 1645 :param drop: The `drop` parameter is a boolean flag that determines whether to drop the column 1646 if it already exists in the table. If `drop` is set to `True`, the function will drop the 1647 existing column before adding the new column. If `drop` is set to `False` (default),, defaults 1648 to False 1649 :type drop: bool (optional) 1650 :return: a boolean value indicating whether the column was successfully added to the table. 1651 """ 1652 1653 # added 1654 added = False 1655 dropped = False 1656 1657 # Check if the column already exists in the table 1658 query = f""" SELECT * FROM {table_name} LIMIT 0 """ 1659 columns = self.get_query_to_df(query).columns.tolist() 1660 if column_name.upper() in [c.upper() for c in columns]: 1661 log.debug( 1662 f"The {column_name} column already exists in the {table_name} table" 1663 ) 1664 if drop: 1665 self.drop_column(table_name=table_name, column_name=column_name) 1666 dropped = True 1667 else: 1668 return None 1669 else: 1670 log.debug(f"The {column_name} column NOT exists in the {table_name} table") 1671 1672 # Add column in table 1673 add_column_query = ( 1674 f""" ALTER TABLE {table_name} ADD COLUMN "{column_name}" {column_type} """ 1675 ) 1676 if default_value is not None: 1677 add_column_query += f" DEFAULT {default_value}" 1678 self.execute_query(add_column_query) 1679 added = not dropped 1680 log.debug( 1681 f"The {column_name} column was successfully added to the {table_name} table" 1682 ) 1683 1684 if added: 1685 added_column = { 1686 "table_name": table_name, 1687 "column_name": column_name, 1688 "column_type": column_type, 1689 "default_value": default_value, 1690 } 1691 else: 1692 added_column = None 1693 1694 return added_column 1695 1696 def drop_column( 1697 self, column: dict = None, table_name: str = None, column_name: str = None 1698 ) -> bool: 1699 """ 1700 The 
`drop_column` function drops a specified column from a given table in a database and returns 1701 True if the column was successfully dropped, and False if the column does not exist in the 1702 table. 1703 1704 :param column: The `column` parameter is a dictionary that contains information about the column 1705 you want to drop. It has two keys: 1706 :type column: dict 1707 :param table_name: The `table_name` parameter is the name of the table from which you want to 1708 drop a column 1709 :type table_name: str 1710 :param column_name: The `column_name` parameter is the name of the column that you want to drop 1711 from the table 1712 :type column_name: str 1713 :return: a boolean value. It returns True if the column was successfully dropped from the table, 1714 and False if the column does not exist in the table. 1715 """ 1716 1717 # Find column infos 1718 if column: 1719 if isinstance(column, dict): 1720 table_name = column.get("table_name", None) 1721 column_name = column.get("column_name", None) 1722 elif isinstance(column, str): 1723 table_name = self.get_table_variants() 1724 column_name = column 1725 else: 1726 table_name = None 1727 column_name = None 1728 1729 if not table_name and not column_name: 1730 return False 1731 1732 # Removed 1733 removed = False 1734 1735 # Check if the column already exists in the table 1736 query = f""" SELECT * FROM {table_name} LIMIT 0 """ 1737 columns = self.get_query_to_df(query).columns.tolist() 1738 if column_name in columns: 1739 log.debug(f"The {column_name} column exists in the {table_name} table") 1740 else: 1741 log.debug(f"The {column_name} column NOT exists in the {table_name} table") 1742 return False 1743 1744 # Add column in table # ALTER TABLE integers DROP k 1745 add_column_query = f""" ALTER TABLE {table_name} DROP "{column_name}" """ 1746 self.execute_query(add_column_query) 1747 removed = True 1748 log.debug( 1749 f"The {column_name} column was successfully dropped to the {table_name} table" 1750 ) 1751 
1752 return removed 1753 1754 def explode_infos( 1755 self, 1756 prefix: str = None, 1757 create_index: bool = False, 1758 fields: list = None, 1759 force: bool = False, 1760 proccess_all_fields_together: bool = False, 1761 table: str = None, 1762 ) -> list: 1763 """ 1764 The `explode_infos` function in Python takes a VCF file and explodes the INFO fields into 1765 individual columns, returning a list of added columns. 1766 1767 :param prefix: The `prefix` parameter is a string that is used as a prefix for the exploded INFO 1768 fields. If the `prefix` is not provided or is set to `None`, the function will use the value of 1769 `self.get_explode_infos_prefix()` as the prefix 1770 :type prefix: str 1771 :param create_index: The `create_index` parameter is a boolean flag that specifies whether to 1772 create indexes on the exploded INFO fields. If set to `True`, indexes will be created; if set to 1773 `False`, indexes will not be created. The default value is `False`, defaults to False 1774 :type create_index: bool (optional) 1775 :param fields: The `fields` parameter in the `explode_infos` function is a list of INFO fields 1776 that you want to explode into individual columns. If this parameter is not provided, all INFO 1777 fields will be exploded. You can specify the INFO fields you want to explode by passing them as 1778 a list to the ` 1779 :type fields: list 1780 :param force: The `force` parameter in the `explode_infos` function is a boolean flag that 1781 determines whether to drop and recreate a column if it already exists in the table. If `force` 1782 is set to `True`, the column will be dropped and recreated. If `force` is set to `False, 1783 defaults to False 1784 :type force: bool (optional) 1785 :param proccess_all_fields_together: The `proccess_all_fields_together` parameter is a boolean 1786 flag that determines whether to process all the INFO fields together or individually. If set to 1787 `True`, all the INFO fields will be processed together. 
If set to `False`, each INFO field will 1788 be processed individually. The default value is, defaults to False 1789 :type proccess_all_fields_together: bool (optional) 1790 :param table: The `table` parameter in the `explode_infos` function is used to specify the name 1791 of the table where the exploded INFO fields will be added as individual columns. If you provide 1792 a value for the `table` parameter, the function will use that table name. If the `table` 1793 parameter is 1794 :type table: str 1795 :return: The `explode_infos` function returns a list of added columns. 1796 """ 1797 1798 # drop indexes 1799 self.drop_indexes() 1800 1801 # connexion format 1802 connexion_format = self.get_connexion_format() 1803 1804 # Access 1805 access = self.get_config().get("access", None) 1806 1807 # Added columns 1808 added_columns = [] 1809 1810 if access not in ["RO"]: 1811 1812 # prefix 1813 if prefix in [None, True] or not isinstance(prefix, str): 1814 if self.get_explode_infos_prefix() not in [None, True]: 1815 prefix = self.get_explode_infos_prefix() 1816 else: 1817 prefix = "INFO/" 1818 1819 # table variants 1820 if table is not None: 1821 table_variants = table 1822 else: 1823 table_variants = self.get_table_variants(clause="select") 1824 1825 # extra infos 1826 try: 1827 extra_infos = self.get_extra_infos() 1828 except: 1829 extra_infos = [] 1830 1831 # Header infos 1832 header_infos = self.get_header().infos 1833 1834 log.debug( 1835 f"Explode INFO fields - ADD [{len(header_infos)}] annotations fields" 1836 ) 1837 1838 sql_info_alter_table_array = [] 1839 1840 # Info fields to check 1841 fields_list = list(header_infos) 1842 if fields: 1843 fields_list += fields 1844 fields_list = set(fields_list) 1845 1846 # If no fields 1847 if not fields: 1848 fields = [] 1849 1850 # Translate fields if patterns 1851 fields = self.get_explode_infos_fields(explode_infos_fields=fields) 1852 1853 for info in fields: 1854 1855 info_id_sql = prefix + info 1856 1857 if ( 1858 info 
in fields_list 1859 or prefix + info in fields_list 1860 or info in extra_infos 1861 ): 1862 1863 log.debug(f"Explode INFO fields - ADD '{info}' annotations fields") 1864 1865 if info in header_infos: 1866 info_type = header_infos[info].type 1867 info_num = header_infos[info].num 1868 else: 1869 info_type = "String" 1870 info_num = 0 1871 1872 type_sql = self.code_type_map_to_sql.get(info_type, "VARCHAR") 1873 if info_num != 1: 1874 type_sql = "VARCHAR" 1875 1876 # Add field 1877 added_column = self.add_column( 1878 table_name=table_variants, 1879 column_name=info_id_sql, 1880 column_type=type_sql, 1881 default_value="null", 1882 drop=force, 1883 ) 1884 1885 if added_column: 1886 added_columns.append(added_column) 1887 1888 if added_column or force: 1889 1890 # add field to index 1891 self.index_additionnal_fields.append(info_id_sql) 1892 1893 # Update field array 1894 if connexion_format in ["duckdb"]: 1895 update_info_field = f""" 1896 "{info_id_sql}" = 1897 CASE 1898 WHEN REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1) IN ('','.') THEN NULL 1899 ELSE REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1) 1900 END 1901 """ 1902 elif connexion_format in ["sqlite"]: 1903 update_info_field = f""" 1904 "{info_id_sql}" = 1905 CASE 1906 WHEN instr(INFO, '{info}=') = 0 THEN NULL 1907 WHEN instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';') = 0 THEN substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1) 1908 ELSE substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1, instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';')-instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')-1) 1909 END 1910 """ 1911 1912 sql_info_alter_table_array.append(update_info_field) 1913 1914 if sql_info_alter_table_array: 1915 1916 # By chromosomes 1917 try: 1918 chromosomes_list = list( 1919 self.get_query_to_df( 1920 f""" 
SELECT "#CHROM" FROM {table_variants} GROUP BY "#CHROM" """ 1921 )["#CHROM"] 1922 ) 1923 except: 1924 chromosomes_list = [None] 1925 1926 for chrom in chromosomes_list: 1927 log.debug(f"Explode INFO fields - Chromosome {chrom}...") 1928 1929 # Where clause 1930 where_clause = "" 1931 if chrom and len(chromosomes_list) > 1: 1932 where_clause = f""" WHERE "#CHROM" = '{chrom}' """ 1933 1934 # Update table 1935 if proccess_all_fields_together: 1936 sql_info_alter_table_array_join = ", ".join( 1937 sql_info_alter_table_array 1938 ) 1939 if sql_info_alter_table_array_join: 1940 sql_info_alter_table = f""" 1941 UPDATE {table_variants} 1942 SET {sql_info_alter_table_array_join} 1943 {where_clause} 1944 """ 1945 log.debug( 1946 f"Explode INFO fields - Explode all {len(sql_info_alter_table_array)} fields..." 1947 ) 1948 # log.debug(sql_info_alter_table) 1949 self.conn.execute(sql_info_alter_table) 1950 else: 1951 sql_info_alter_num = 0 1952 for sql_info_alter in sql_info_alter_table_array: 1953 sql_info_alter_num += 1 1954 sql_info_alter_table = f""" 1955 UPDATE {table_variants} 1956 SET {sql_info_alter} 1957 {where_clause} 1958 """ 1959 log.debug( 1960 f"Explode INFO fields - Explode field {sql_info_alter_num}/{len(sql_info_alter_table_array)}..." 
                            )
                            # log.debug(sql_info_alter_table)
                            self.conn.execute(sql_info_alter_table)

        # Re-create indexes if requested
        if create_index:
            self.create_indexes()

        return added_columns

    def create_indexes(self) -> None:
        """
        Create indexes on the variants table after insertion.

        Indexes cover the composite variant key ("#CHROM", "POS", "REF",
        "ALT"), each key column individually, and every field registered in
        `self.index_additionnal_fields`. Nothing is done when indexing is
        disabled (see `get_indexing`) or when the connexion is read-only
        (access "RO").
        """

        # Access mode ("RO" means read-only: no index creation allowed)
        access = self.get_config().get("access", None)

        # Get variants table name
        table_variants = self.get_table_variants("FROM")

        if self.get_indexing() and access not in ["RO"]:
            # Composite index on the variant key
            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()} ON {table_variants} ("#CHROM", "POS", "REF", "ALT")'
            self.conn.execute(sql_create_table_index)
            # Single-column indexes on each variant key column
            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_chrom ON {table_variants} ("#CHROM")'
            self.conn.execute(sql_create_table_index)
            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_pos ON {table_variants} ("POS")'
            self.conn.execute(sql_create_table_index)
            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_ref ON {table_variants} ( "REF")'
            self.conn.execute(sql_create_table_index)
            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_alt ON {table_variants} ("ALT")'
            self.conn.execute(sql_create_table_index)
            # Indexes on additional (exploded INFO) fields
            for field in self.index_additionnal_fields:
                sql_create_table_index = f""" CREATE INDEX IF NOT EXISTS "idx_{self.get_table_variants()}_{field}" ON {table_variants} ("{field}") """
                self.conn.execute(sql_create_table_index)

    def drop_indexes(self) -> None:
        """
        Drop all indexes of the variants table.

        Index names are listed from the database catalog (`duckdb_indexes`
        for DuckDB, `sqlite_master` for SQLite) and dropped one by one.
        Nothing is done when the connexion is read-only (access "RO").
        """

        # Access mode ("RO" means read-only: no index drop allowed)
        access = self.get_config().get("access", None)

        # Get variants table name
        table_variants = self.get_table_variants("FROM")

        # Get database format
        connexion_format = self.get_connexion_format()

        if access not in ["RO"]:
            # NOTE(review): sql_list_indexes is unbound (NameError) if
            # connexion_format is neither "duckdb" nor "sqlite" -- confirm
            # only these two formats can occur here
            if connexion_format in ["duckdb"]:
                sql_list_indexes = f"SELECT index_name FROM duckdb_indexes WHERE table_name='{table_variants}'"
            elif connexion_format in ["sqlite"]:
                sql_list_indexes = f"SELECT name FROM sqlite_master WHERE type='index' AND tbl_name='{table_variants}';"

            list_indexes = self.conn.execute(sql_list_indexes)
            index_names = [row[0] for row in list_indexes.fetchall()]
            for index in index_names:
                sql_drop_table_index = f""" DROP INDEX IF EXISTS "{index}" """
                self.conn.execute(sql_drop_table_index)

    def read_vcf_header(self, f) -> list:
        """
        Read the header of a VCF file from an open file object.

        Lines are accumulated up to and including the "#CHROM" column line.
        NOTE(review): if no "#CHROM" line is present, the whole file content
        is returned -- confirm callers always provide a well-formed header.

        :param f: an open file object positioned at the start of the VCF
        :return: the list of header lines (each keeping its newline)
        """

        header_list = []
        for line in f:
            header_list.append(line)
            if line.startswith("#CHROM"):
                break
        return header_list

    def read_vcf_header_file(self, file: str = None) -> list:
        """
        Read the header of a VCF file, handling both bgzip-compressed and
        plain-text files.

        :param file: path to the VCF (or VCF header) file to read
        :type file: str
        :return: the list of header lines
        """

        if self.get_input_compressed(input_file=file):
            with bgzf.open(file, "rt") as f:
                return self.read_vcf_header(f=f)
        else:
            with open(file, "rt") as f:
                return self.read_vcf_header(f=f)

    def execute_query(self, query: str):
        """
        Execute a query on the current connexion and return its result.

        :param query: the query to be executed (nothing runs when empty)
        :return: the result of `conn.execute`, or None when no query is given
2064 """ 2065 if query: 2066 return self.conn.execute(query) # .fetchall() 2067 else: 2068 return None 2069 2070 def export_output( 2071 self, 2072 output_file: str | None = None, 2073 output_header: str | None = None, 2074 export_header: bool = True, 2075 query: str | None = None, 2076 parquet_partitions: list | None = None, 2077 chunk_size: int | None = None, 2078 threads: int | None = None, 2079 sort: bool = False, 2080 index: bool = False, 2081 order_by: str | None = None, 2082 ) -> bool: 2083 """ 2084 The `export_output` function exports data from a VCF file to a specified output file in various 2085 formats, including VCF, CSV, TSV, PSV, and Parquet. 2086 2087 :param output_file: The `output_file` parameter is a string that specifies the name of the 2088 output file to be generated by the function. This is where the exported data will be saved 2089 :type output_file: str 2090 :param output_header: The `output_header` parameter is a string that specifies the name of the 2091 file where the header of the VCF file will be exported. If this parameter is not provided, the 2092 header will be exported to a file with the same name as the `output_file` parameter, but with 2093 the extension " 2094 :type output_header: str 2095 :param export_header: The `export_header` parameter is a boolean flag that determines whether 2096 the header of a VCF file should be exported to a separate file or not. If `export_header` is 2097 True, the header will be exported to a file. If `export_header` is False, the header will not 2098 be, defaults to True, if output format is not VCF 2099 :type export_header: bool (optional) 2100 :param query: The `query` parameter is an optional SQL query that can be used to filter and 2101 select specific data from the VCF file before exporting it. 
If provided, only the data that 2102 matches the query will be exported 2103 :type query: str 2104 :param parquet_partitions: The `parquet_partitions` parameter is a list that specifies the 2105 columns to be used for partitioning the Parquet file during export. Partitioning is a way to 2106 organize data in a hierarchical directory structure based on the values of one or more columns. 2107 This can improve query performance when working with large datasets 2108 :type parquet_partitions: list 2109 :param chunk_size: The `chunk_size` parameter specifies the number of 2110 records in batch when exporting data in Parquet format. This parameter is used for 2111 partitioning the Parquet file into multiple files. 2112 :type chunk_size: int 2113 :param threads: The `threads` parameter is an optional parameter that specifies the number of 2114 threads to be used during the export process. It determines the level of parallelism and can 2115 improve the performance of the export operation. If not provided, the function will use the 2116 default number of threads 2117 :type threads: int 2118 :param sort: The `sort` parameter is a boolean flag that determines whether the output file 2119 should be sorted or not. If `sort` is set to `True`, the output file will be sorted based on the 2120 genomic coordinates of the variants. By default, the value of `sort` is `False`, defaults to 2121 False 2122 :type sort: bool (optional) 2123 :param index: The `index` parameter is a boolean flag that determines whether an index should be 2124 created on the output file. If `index` is True, an index will be created. If `index` is False, 2125 no index will be created. The default value is False, defaults to False 2126 :type index: bool (optional) 2127 :param order_by: The `order_by` parameter is a string that specifies the column(s) to use for 2128 sorting the output file. This parameter is only applicable when exporting data in VCF format 2129 :type order_by: str 2130 :return: a boolean value. 
It checks if the output file exists and returns True if it does, or 2131 None if it doesn't. 2132 """ 2133 2134 # Log 2135 log.info("Exporting...") 2136 2137 # Full path 2138 output_file = full_path(output_file) 2139 output_header = full_path(output_header) 2140 2141 # Config 2142 config = self.get_config() 2143 2144 # Param 2145 param = self.get_param() 2146 2147 # Tmp files to remove 2148 tmp_to_remove = [] 2149 2150 # If no output, get it 2151 if not output_file: 2152 output_file = self.get_output() 2153 2154 # If not threads 2155 if not threads: 2156 threads = self.get_threads() 2157 2158 # Auto header name with extension 2159 if export_header or output_header: 2160 if not output_header: 2161 output_header = f"{output_file}.hdr" 2162 # Export header 2163 self.export_header(output_file=output_file) 2164 2165 # Switch off export header if VCF output 2166 output_file_type = get_file_format(output_file) 2167 if output_file_type in ["vcf"]: 2168 export_header = False 2169 tmp_to_remove.append(output_header) 2170 2171 # Chunk size 2172 if not chunk_size: 2173 chunk_size = config.get("chunk_size", None) 2174 2175 # Parquet partition 2176 if not parquet_partitions: 2177 parquet_partitions = param.get("export", {}).get("parquet_partitions", None) 2178 if parquet_partitions and isinstance(parquet_partitions, str): 2179 parquet_partitions = parquet_partitions.split(",") 2180 2181 # Order by 2182 if not order_by: 2183 order_by = param.get("export", {}).get("order_by", "") 2184 2185 # Header in output 2186 header_in_output = param.get("export", {}).get("include_header", False) 2187 2188 # Database 2189 database_source = self.get_connexion() 2190 2191 # Connexion format 2192 connexion_format = self.get_connexion_format() 2193 2194 # Explode infos 2195 if self.get_explode_infos(): 2196 self.explode_infos( 2197 prefix=self.get_explode_infos_prefix(), 2198 fields=self.get_explode_infos_fields(), 2199 force=False, 2200 ) 2201 2202 # if connexion_format in ["sqlite"] or query: 
2203 if connexion_format in ["sqlite"]: 2204 2205 # Export in Parquet 2206 random_tmp = "".join( 2207 random.choice(string.ascii_lowercase) for i in range(10) 2208 ) 2209 database_source = f"""{output_file}.{random_tmp}.database_export.parquet""" 2210 tmp_to_remove.append(database_source) 2211 2212 # Table Variants 2213 table_variants = self.get_table_variants() 2214 2215 # Create export query 2216 sql_query_export_subquery = f""" 2217 SELECT * FROM {table_variants} 2218 """ 2219 2220 # Write source file 2221 fp.write(database_source, self.get_query_to_df(sql_query_export_subquery)) 2222 2223 # Create database 2224 database = Database( 2225 database=database_source, 2226 table="variants", 2227 header_file=output_header, 2228 conn_config=self.get_connexion_config(), 2229 ) 2230 2231 # Existing colomns header 2232 existing_columns_header = database.get_header_columns_from_database() 2233 2234 # Sample list 2235 get_samples = self.get_samples() 2236 get_samples_check = self.get_samples_check() 2237 samples_force = get_samples is not None 2238 sample_list = self.get_header_sample_list( 2239 check=get_samples_check, samples=get_samples, samples_force=samples_force 2240 ) 2241 2242 # Export file 2243 database.export( 2244 output_database=output_file, 2245 output_header=output_header, 2246 existing_columns_header=existing_columns_header, 2247 parquet_partitions=parquet_partitions, 2248 chunk_size=chunk_size, 2249 threads=threads, 2250 sort=sort, 2251 index=index, 2252 header_in_output=header_in_output, 2253 order_by=order_by, 2254 query=query, 2255 export_header=export_header, 2256 sample_list=sample_list, 2257 ) 2258 2259 # Remove 2260 remove_if_exists(tmp_to_remove) 2261 2262 return (os.path.exists(output_file) or None) and ( 2263 os.path.exists(output_file) or None 2264 ) 2265 2266 def get_extra_infos(self, table: str = None) -> list: 2267 """ 2268 The `get_extra_infos` function returns a list of columns that are in a specified table but not 2269 in the header. 

        :param table: name of the table to inspect; when omitted, the variants
            table is used and the header columns are taken into account
        :type table: str
        :return: list of columns present in the table but not in the header
        """

        header_columns = []

        if not table:
            table = self.get_table_variants(clause="from")
            header_columns = self.get_header_columns()

        # Fetch the table columns (LIMIT 1 is enough to get the schema)
        query = f""" SELECT * FROM {table} LIMIT 1 """
        log.debug(f"query {query}")
        table_columns = self.get_query_to_df(query).columns.tolist()
        extra_columns = []

        # Construct extra infos (columns not in header)
        for column in table_columns:
            if column not in header_columns:
                extra_columns.append(column)

        return extra_columns

    def get_extra_infos_sql(self, table: str = None) -> str:
        """
        Return the extra columns (see `get_extra_infos`) as a comma-separated
        string, each column surrounded by double quotes.

        :param table: name of the table to inspect; when omitted, the variants
            table is used
        :type table: str
        :return: a SQL-ready string of quoted extra columns
        """

        return ", ".join(
            ['"' + str(elem) + '"' for elem in self.get_extra_infos(table=table)]
        )

    def export_header(
        self,
        header_name: str = None,
        output_file: str = None,
        output_file_ext: str = ".hdr",
        clean_header: bool = True,
        remove_chrom_line: bool = False,
    ) -> str:
        """
        Extract the VCF header, adjust it, and write it to a header file.

        The "#CHROM" line is replaced with the real columns found in the
        input file; malformed "##FORMAT" lines of Type=Flag are rewritten
        as Type=String when cleaning is enabled.

        :param header_name: name of the header file to create.
            NOTE(review): this parameter is currently never used below --
            the header is always written next to `output_file`; confirm
            intended behaviour
        :type header_name: str
        :param output_file: base name of the output file; the header is
            written to "<output_file><output_file_ext>" (defaults to
            `get_output()` when neither header_name nor output_file is given)
        :type output_file: str
        :param output_file_ext: extension of the header file, defaults to ".hdr"
        :type output_file_ext: str (optional)
        :param clean_header: whether malformed header lines are cleaned,
            defaults to True
        :type clean_header: bool (optional)
        :param remove_chrom_line: whether the "#CHROM" line is removed from
            the header before writing, defaults to False
        :type remove_chrom_line: bool (optional)
        :return: the name of the header file created (None when no header)
        """

        if not header_name and not output_file:
            output_file = self.get_output()

        if self.get_header():

            # Get header object
            header_obj = self.get_header()

            # Create database object on the input to read its real columns
            db_for_header = Database(database=self.get_input())

            # Get real columns in the file
            db_header_columns = db_for_header.get_columns()

            with tempfile.TemporaryDirectory() as tmpdir:

                # Write header to a temporary file (vcf.Writer emits the
                # header lines on creation)
                header_file_tmp = os.path.join(tmpdir, "header")
                f = open(header_file_tmp, "w")
                vcf.Writer(f, header_obj)
                f.close()

                # Replace #CHROM line with real columns
                header_list = db_for_header.read_header_file(
                    header_file=header_file_tmp
                )
                header_list[-1] = "\t".join(db_header_columns)

                # Remove CHROM line
                if remove_chrom_line:
                    header_list.pop()

                # Clean header
                if clean_header:
                    header_list_clean = []
                    for head in header_list:
                        # Clean head for malformed header (FORMAT fields of
                        # Type=Flag are not valid: rewrite them as String)
                        head_clean = head
                        head_clean = re.subn(
                            "##FORMAT=<ID=(.*),Number=(.*),Type=Flag",
                            r"##FORMAT=<ID=\1,Number=\2,Type=String",
                            head_clean,
                            2,
                        )[0]
                        # Keep cleaned line
                        header_list_clean.append(head_clean)
                    header_list = header_list_clean

            # NOTE(review): raises TypeError when output_file is None (e.g.
            # only header_name provided) -- confirm callers set output_file
            tmp_header_name = output_file + output_file_ext

            f = open(tmp_header_name, "w")
            for line in header_list:
                f.write(line)
            f.close()

            return tmp_header_name

    def export_variant_vcf(
        self,
        vcf_file,
        remove_info: bool = False,
        add_samples: bool = True,
        list_samples: list = [],
        where_clause: str = "",
        index: bool = False,
        threads: int | None = None,
    ) -> bool | None:
        """
        Export the variants as a VCF file, with options to strip the INFO
        field, select samples, and index the output.

        :param vcf_file: name of the VCF file to write
        :param remove_info: if truthy, the INFO column is replaced in the
            output; a string value is used verbatim as INFO, any other truthy
            value yields ".", defaults to False
        :type remove_info: bool (optional)
        :param add_samples: whether FORMAT and sample columns are included in
            the output, defaults to True.
            NOTE(review): mutable default `list_samples=[]` below is shared
            between calls -- confirm it is never mutated
        :type add_samples: bool (optional)
        :param list_samples: samples to include in the output; defaults to all
            samples of the header
        :type list_samples: list
        :param where_clause: optional SQL WHERE clause filtering the variants
        :type where_clause: str
        :param index: whether the output VCF is indexed, defaults to False
        :type index: bool (optional)
        :param threads: number of threads used for the export (defaults to
            `get_threads()`)
        :type threads: int | None
        :return: the result of `export_output` (True if the output file
            exists after export, otherwise None)
        """

        # Config
        config = self.get_config()

        # Extract VCF
        log.debug("Export VCF...")

        # Table variants
        table_variants = self.get_table_variants()

        # Threads
        if not threads:
            threads = self.get_threads()

        # INFO field: kept as-is, or replaced by a constant value
        if remove_info:
            if not isinstance(remove_info, str):
                remove_info = "."
            info_field = f"""'{remove_info}' as INFO"""
        else:
            info_field = "INFO"

        # Samples fields (FORMAT plus one column per sample), or nothing
        if add_samples:
            if not list_samples:
                list_samples = self.get_header_sample_list()
            if list_samples:
                samples_fields = " , FORMAT , " + " , ".join(list_samples)
            else:
                samples_fields = ""
            log.debug(f"samples_fields: {samples_fields}")
        else:
            samples_fields = ""

        # Where clause
        if where_clause is None:
            where_clause = ""

        # Select the VCF columns
        select_fields = """ "#CHROM", POS, ID, REF, ALT, QUAL, FILTER """
        sql_query_select = f""" SELECT {select_fields}, {info_field} {samples_fields} FROM {table_variants} {where_clause} """
        log.debug(f"sql_query_select={sql_query_select}")

        return self.export_output(
            output_file=vcf_file,
            output_header=None,
            export_header=True,
            query=sql_query_select,
            parquet_partitions=None,
            chunk_size=config.get("chunk_size", None),
            threads=threads,
            sort=True,
            index=index,
            order_by=None,
        )

    def run_commands(self, commands: list = [], threads: int = 1) -> None:
        """
        Run a list of commands in parallel using the given number of threads.

        NOTE(review): mutable default `commands=[]` is shared between calls.

        :param commands: the list of commands to run
        :param threads: the number of threads to use, defaults to 1 (optional)
        """

        run_parallel_commands(commands, threads)

    def get_threads(self, default: int = 1) -> int:
        """
        Return the number of threads to use for the current job.

        The value is read from the parameters, then from the configuration;
        a configured value <= 0 means "use all available CPUs".

        :param default: number of threads used when nothing is configured,
            defaults to 1
        :type default: int (optional)
        :return: the number of threads to use
        """

        # Config
        config = self.get_config()

        # Param
        param = self.get_param()

        # Input threads (param overrides config)
        input_thread = param.get("threads", config.get("threads", None))

        # Check threads
        if not input_thread:
            threads = default
        elif int(input_thread) <= 0:
            threads = os.cpu_count()
        else:
            threads = int(input_thread)
        return threads

    def get_memory(self, default: str = None) -> str:
        """
        Return the memory setting from parameters or configuration.

        :param default: fallback value used when no "memory" entry is found
            in either the parameters or the configuration
        :type default: str
        :return: the configured memory value when provided,
            otherwise the default value provided as an argument.
        """

        # Config
        config = self.get_config()

        # Param
        param = self.get_param()

        # Input memory (param overrides config)
        input_memory = param.get("memory", config.get("memory", None))

        # Check memory
        if input_memory:
            memory = input_memory
        else:
            memory = default

        return memory

    def update_from_vcf(self, vcf_file: str) -> None:
        """
        Update the variants table from a VCF file, dispatching on the
        database format: DataFrame-based update for duckdb, temporary-table
        update for sqlite.

        :param vcf_file: the path to the VCF file
        """

        connexion_format = self.get_connexion_format()

        if connexion_format in ["duckdb"]:
            self.update_from_vcf_duckdb(vcf_file)
        elif connexion_format in ["sqlite"]:
            self.update_from_vcf_sqlite(vcf_file)

    def update_from_vcf_duckdb(self, vcf_file: str) -> None:
        """
        Update the INFO column of the variants table (duckdb) with the INFO
        column of a VCF file, matching variants on "#CHROM", POS, REF and
        ALT; existing and new INFO values are concatenated with ';'.

        :param vcf_file: the path to the VCF file
        """

        # variants table
        table_variants = self.get_table_variants()

        # Load the VCF into a DataFrame (header lines skipped); duckdb can
        # query the local DataFrame directly as 'vcf_df'
        skip = self.get_header_length(file=vcf_file)
        vcf_df = pd.read_csv(
            vcf_file,
            sep="\t",
            engine="c",
            skiprows=skip,
            header=0,
            low_memory=False,
        )
        sql_query_update = f"""
        UPDATE {table_variants} as table_variants
        SET INFO = concat(
                        CASE
                            WHEN INFO NOT IN ('', '.')
                            THEN INFO
                            ELSE ''
                        END,
                        (
                        SELECT
                            concat(
                                CASE
                                    WHEN table_variants.INFO NOT IN ('','.') AND table_parquet.INFO NOT IN ('','.')
                                    THEN ';'
                                    ELSE ''
                                END
                                ,
                                CASE
                                    WHEN table_parquet.INFO NOT IN ('','.')
                                    THEN table_parquet.INFO
                                    ELSE ''
                                END
                            )
                        FROM vcf_df as table_parquet
                        WHERE CAST(table_parquet.\"#CHROM\" AS VARCHAR) = CAST(table_variants.\"#CHROM\" AS VARCHAR)
                            AND table_parquet.\"POS\" = table_variants.\"POS\"
                            AND table_parquet.\"ALT\" = table_variants.\"ALT\"
                            AND table_parquet.\"REF\" = table_variants.\"REF\"
                            AND table_parquet.INFO NOT IN ('','.')
                        )
                    )
        ;
        """
        self.conn.execute(sql_query_update)

    def update_from_vcf_sqlite(self, vcf_file: str) -> None:
        """
        Update the INFO column of the variants table (sqlite) with the INFO
        column of a VCF file, via a temporary table, matching variants on
        "#CHROM", POS, REF and ALT; existing and new INFO values are
        concatenated with ';'.

        :param vcf_file: the path to the VCF file to update the database with
        """

        # Create a temporary table for the VCF
        table_vcf = "tmp_vcf"
        sql_create = (
            f"CREATE TEMPORARY TABLE {table_vcf} AS SELECT * FROM variants WHERE 0"
        )
        self.conn.execute(sql_create)

        # Load the VCF into the temporary table
        vcf_df = pd.read_csv(
            vcf_file, sep="\t", comment="#", header=None, low_memory=False
        )
        vcf_df.columns = ["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"]
        vcf_df.to_sql(table_vcf, self.conn, if_exists="append", index=False)

        # Update table 'variants' with VCF data
        # warning: CONCAT as || operator (sqlite has no concat() function)
        sql_query_update = f"""
        UPDATE variants as table_variants
        SET INFO = CASE
                        WHEN INFO NOT IN ('', '.')
                        THEN INFO
                        ELSE ''
                    END ||
                    (
                    SELECT
                        CASE
                            WHEN table_variants.INFO NOT IN ('','.')
                                AND table_vcf.INFO NOT IN ('','.')
                            THEN ';'
                            ELSE ''
                        END ||
                        CASE
                            WHEN table_vcf.INFO NOT IN ('','.')
                            THEN table_vcf.INFO
                            ELSE ''
                        END
                    FROM {table_vcf} as table_vcf
                    WHERE table_vcf.\"#CHROM\" = table_variants.\"#CHROM\"
                        AND table_vcf.\"POS\" = table_variants.\"POS\"
                        AND table_vcf.\"ALT\" = table_variants.\"ALT\"
                        AND table_vcf.\"REF\" = table_variants.\"REF\"
                    )
        """
        self.conn.execute(sql_query_update)

        # Drop temporary table
        sql_drop = f"DROP TABLE {table_vcf}"
        self.conn.execute(sql_drop)

    def drop_variants_table(self) -> None:
        """
        Drop the variants table if it exists.
        """

        table_variants = self.get_table_variants()
        sql_table_variants = f"DROP TABLE IF EXISTS {table_variants}"
        self.conn.execute(sql_table_variants)

    def set_variant_id(
        self, variant_id_column: str = "variant_id", force: bool = None
    ) -> str:
        """
        Add a variant identifier column to the variants table, populated with
        a hash of the assembly, "#CHROM", "POS", "REF", "ALT" and the SVTYPE
        annotation.

        :param variant_id_column: name of the column to create, defaults to
            variant_id
        :type variant_id_column: str (optional)
        :param force: if True, the column is (re)populated even if it exists
        :type force: bool
        :return: the name of the column that contains the variant id
        """

        # Assembly (param, then config, then default)
        assembly = self.get_param().get(
            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
        )

        # INFO/Tag prefix
        prefix = self.get_explode_infos_prefix()

        # Explode INFO/SVTYPE (needed as a column for the hash)
        added_columns = self.explode_infos(prefix=prefix, fields=["SVTYPE"])

        # variants table
        table_variants = self.get_table_variants()

        # variant_id column
        if not variant_id_column:
            variant_id_column = "variant_id"

        # Create variant_id column
        # NOTE(review): the existence check uses the literal "variant_id"
        # rather than variant_id_column -- confirm intended for custom names
        if "variant_id" not in self.get_extra_infos() or force:

            # Create column
            self.add_column(
                table_name=table_variants,
                column_name=variant_id_column,
                column_type="UBIGINT",
                default_value="0",
            )

            # Update column with the hash of the variant key
            self.conn.execute(
                f"""
                UPDATE {table_variants}
                SET "{variant_id_column}" = hash('{assembly}', "#CHROM", "POS", "REF", "ALT",
'"{prefix}SVTYPE"')
                """
            )

        # Remove columns added by explode_infos
        for added_column in added_columns:
            self.drop_column(column=added_column)

        # return variant_id column name
        return variant_id_column

    def get_variant_id_column(
        self, variant_id_column: str = "variant_id", force: bool = None
    ) -> str:
        """
        Return the variant_id column name, creating/updating the column
        through `set_variant_id` when needed.

        :param variant_id_column: name of the variant id column, defaults to
            variant_id
        :type variant_id_column: str (optional)
        :param force: if True, force the column to be (re)populated
        :type force: bool
        :return: the variant_id column name
        """

        return self.set_variant_id(variant_id_column=variant_id_column, force=force)

    ###
    # Annotation
    ###

    def scan_databases(
        self,
        database_formats: list = ["parquet"],
        database_releases: list = ["current"],
    ) -> dict:
        """
        Scan for available annotation databases matching the given formats
        and releases, for the configured assembly.

        NOTE(review): both defaults are mutable lists -- shared between calls.

        :param database_formats: formats of the databases to scan, defaults
            to ["parquet"]
        :type database_formats: list
        :param database_releases: releases of the databases to scan, defaults
            to ["current"]
        :type database_releases: list
        :return: a dict of information about the databases found
        """

        # Config
        config = self.get_config()

        # Param
        param = self.get_param()

        # Param - Assembly (param, then config, then default)
        assembly = param.get("assembly", config.get("assembly", None))
        if not assembly:
            assembly = DEFAULT_ASSEMBLY
            log.warning(f"Default assembly '{assembly}'")

        # Scan for available databases
        log.info(
            f"Annotations - Check annotation parameters - Scan existing databases - Assembly {[assembly]} - Formats {database_formats} - Releases {database_releases}..."
        )
        databases_infos_dict = databases_infos(
            database_folder_releases=database_releases,
            database_formats=database_formats,
            assembly=assembly,
            config=config,
        )
        log.info(
            f"Annotations - Check annotation parameters - Scan existing databases - {len(databases_infos_dict)} databases found"
        )

        return databases_infos_dict

    def annotation(self) -> None:
        """
        Annotate the VCF file with the annotations specified in the
        configuration and parameters.
    def annotation(self) -> None:
        """
        Annotate the loaded variants with every annotation source configured
        in `param`.

        The method normalizes all quick-annotation parameters
        (`annotations` string plus the per-tool `annotation_*` shortcuts)
        into the structured `param["annotation"]` dict, resolving database
        files against the configured annotation folders, then dispatches to
        the per-tool annotation methods (parquet, bcftools, snpsift,
        annovar, snpeff, exomiser, splice). Finally, INFO fields are
        optionally exploded into table columns.
        """

        # Config
        config = self.get_config()

        # Param
        param = self.get_param()

        # Param - Assembly: param takes precedence over config, then default
        assembly = param.get("assembly", config.get("assembly", None))
        if not assembly:
            assembly = DEFAULT_ASSEMBLY
            log.warning(f"Default assembly '{assembly}'")

        # annotations databases folders (annotations + parquet + bcftools folders)
        annotations_databases = set(
            config.get("folders", {})
            .get("databases", {})
            .get("annotations", [DEFAULT_ANNOTATIONS_FOLDER])
            + config.get("folders", {})
            .get("databases", {})
            .get("parquet", ["~/howard/databases/parquet/current"])
            + config.get("folders", {})
            .get("databases", {})
            .get("bcftools", ["~/howard/databases/bcftools/current"])
        )

        # Get param annotations (comma-separated string form only)
        if param.get("annotations", None) and isinstance(
            param.get("annotations", None), str
        ):
            log.debug(param.get("annotations", None))
            param_annotation_list = param.get("annotations").split(",")
        else:
            param_annotation_list = []

        # Each tools param
        # NOTE(review): the `!= None` comparisons below should be `is not None`
        # (PEP 8); behavior is the same for these dict values.
        if param.get("annotation_parquet", None) != None:
            log.debug(
                f"""param.get("annotation_parquet", None)={param.get("annotation_parquet", None)}"""
            )
            if isinstance(param.get("annotation_parquet", None), list):
                param_annotation_list.append(",".join(param.get("annotation_parquet")))
            else:
                param_annotation_list.append(param.get("annotation_parquet"))
        if param.get("annotation_snpsift", None) != None:
            # snpsift entries are joined with "+" so they survive the
            # top-level "," split as a single "snpsift:" token
            if isinstance(param.get("annotation_snpsift", None), list):
                param_annotation_list.append(
                    "snpsift:"
                    + "+".join(param.get("annotation_snpsift")).replace(",", "+")
                )
            else:
                param_annotation_list.append(
                    "snpsift:" + param.get("annotation_snpsift").replace(",", "+")
                )
        if param.get("annotation_snpeff", None) != None:
            param_annotation_list.append("snpeff:" + param.get("annotation_snpeff"))
        if param.get("annotation_bcftools", None) != None:
            if isinstance(param.get("annotation_bcftools", None), list):
                param_annotation_list.append(
                    "bcftools:"
                    + "+".join(param.get("annotation_bcftools")).replace(",", "+")
                )
            else:
                param_annotation_list.append(
                    "bcftools:" + param.get("annotation_bcftools").replace(",", "+")
                )
        if param.get("annotation_annovar", None) != None:
            param_annotation_list.append("annovar:" + param.get("annotation_annovar"))
        if param.get("annotation_exomiser", None) != None:
            param_annotation_list.append("exomiser:" + param.get("annotation_exomiser"))
        if param.get("annotation_splice", None) != None:
            param_annotation_list.append("splice:" + param.get("annotation_splice"))

        # Merge param annotations list back into a single comma-separated string
        param["annotations"] = ",".join(param_annotation_list)

        # debug
        log.debug(f"param_annotations={param['annotations']}")

        if param.get("annotations"):

            # Log
            # log.info("Annotations - Check annotation parameters")

            if not "annotation" in param:
                param["annotation"] = {}

            # List of annotations parameters: string form becomes a dict of
            # {annotation_file: {"INFO": None}} (i.e. "all INFO fields")
            annotations_list_input = {}
            if isinstance(param.get("annotations", None), str):
                annotation_file_list = [
                    value for value in param.get("annotations", "").split(",")
                ]
                for annotation_file in annotation_file_list:
                    annotations_list_input[annotation_file] = {"INFO": None}
            else:
                annotations_list_input = param.get("annotations", {})

            log.info(f"Quick Annotations:")
            for annotation_key in list(annotations_list_input.keys()):
                log.info(f"   {annotation_key}")

            # List of annotations and associated fields
            annotations_list = {}

            for annotation_file in annotations_list_input:

                # Explode annotations if ALL: "ALL[:format=...][:release=...]"
                # expands to every database found by scan_databases
                if (
                    annotation_file.upper() == "ALL"
                    or annotation_file.upper().startswith("ALL:")
                ):

                    # check ALL parameters (formats, releases)
                    annotation_file_split = annotation_file.split(":")
                    database_formats = "parquet"
                    database_releases = "current"
                    for annotation_file_option in annotation_file_split[1:]:
                        database_all_options_split = annotation_file_option.split("=")
                        if database_all_options_split[0] == "format":
                            database_formats = database_all_options_split[1].split("+")
                        if database_all_options_split[0] == "release":
                            database_releases = database_all_options_split[1].split("+")

                    # Scan for availabled databases
                    databases_infos_dict = self.scan_databases(
                        database_formats=database_formats,
                        database_releases=database_releases,
                    )

                    # Add found databases in annotation parameters
                    for database_infos in databases_infos_dict.keys():
                        annotations_list[database_infos] = {"INFO": None}

                else:
                    annotations_list[annotation_file] = annotations_list_input[
                        annotation_file
                    ]

            # Check each databases
            if len(annotations_list):

                log.info(
                    f"Annotations - Check annotation parameters - Check {len(annotations_list)} databases..."
                )

                for annotation_file in annotations_list:

                    # Init
                    annotations = annotations_list.get(annotation_file, None)

                    # Annotation snpEff: "snpeff[:options]"
                    if annotation_file.startswith("snpeff"):

                        log.debug(f"Quick Annotation snpEff")

                        if "snpeff" not in param["annotation"]:
                            param["annotation"]["snpeff"] = {}

                        if "options" not in param["annotation"]["snpeff"]:
                            param["annotation"]["snpeff"]["options"] = ""

                        # snpEff options in annotations
                        param["annotation"]["snpeff"]["options"] = "".join(
                            annotation_file.split(":")[1:]
                        )

                    # Annotation Annovar: "annovar:db1:db2..."
                    elif annotation_file.startswith("annovar"):

                        log.debug(f"Quick Annotation Annovar")

                        if "annovar" not in param["annotation"]:
                            param["annotation"]["annovar"] = {}

                        if "annotations" not in param["annotation"]["annovar"]:
                            param["annotation"]["annovar"]["annotations"] = {}

                        # Options
                        annotation_file_split = annotation_file.split(":")
                        for annotation_file_annotation in annotation_file_split[1:]:
                            if annotation_file_annotation:
                                param["annotation"]["annovar"]["annotations"][
                                    annotation_file_annotation
                                ] = annotations

                    # Annotation Exomiser: parsed into a dict of parameters
                    elif annotation_file.startswith("exomiser"):

                        log.debug(f"Quick Annotation Exomiser")

                        param["annotation"]["exomiser"] = params_string_to_dict(
                            annotation_file
                        )

                    # Annotation Splice: parsed into a dict of parameters
                    elif annotation_file.startswith("splice"):

                        log.debug(f"Quick Annotation Splice")

                        param["annotation"]["splice"] = params_string_to_dict(
                            annotation_file
                        )

                    # Annotation Parquet or BCFTOOLS (file-based annotation)
                    else:

                        # Tools detection: optional explicit "bcftools:"/"snpsift:" prefix
                        if annotation_file.startswith("bcftools:"):
                            annotation_tool_initial = "bcftools"
                            annotation_file = ":".join(annotation_file.split(":")[1:])
                        elif annotation_file.startswith("snpsift:"):
                            annotation_tool_initial = "snpsift"
                            annotation_file = ":".join(annotation_file.split(":")[1:])
                        else:
                            annotation_tool_initial = None

                        # list of files ("+" and ":" both act as separators)
                        annotation_file_list = annotation_file.replace("+", ":").split(
                            ":"
                        )

                        for annotation_file in annotation_file_list:

                            if annotation_file:

                                # Annotation tool initial
                                annotation_tool = annotation_tool_initial

                                # Find file
                                annotation_file_found = None

                                # Expand user
                                annotation_file = full_path(annotation_file)

                                if os.path.exists(annotation_file):
                                    annotation_file_found = annotation_file

                                else:
                                    # Find within assembly folders
                                    for annotations_database in annotations_databases:
                                        found_files = find_all(
                                            annotation_file,
                                            os.path.join(
                                                annotations_database, assembly
                                            ),
                                        )
                                        if len(found_files) > 0:
                                            annotation_file_found = found_files[0]
                                            break
                                    if not annotation_file_found and not assembly:
                                        # Find within folders (no assembly subfolder)
                                        for (
                                            annotations_database
                                        ) in annotations_databases:
                                            found_files = find_all(
                                                annotation_file, annotations_database
                                            )
                                            if len(found_files) > 0:
                                                annotation_file_found = found_files[0]
                                                break
                                    log.debug(
                                        f"for {annotation_file} annotation_file_found={annotation_file_found}"
                                    )

                                # Full path
                                annotation_file_found = full_path(annotation_file_found)

                                if annotation_file_found:

                                    database = Database(database=annotation_file_found)
                                    quick_annotation_format = database.get_format()
                                    quick_annotation_is_compressed = (
                                        database.is_compressed()
                                    )
                                    quick_annotation_is_indexed = os.path.exists(
                                        f"{annotation_file_found}.tbi"
                                    )
                                    # bcftools_preference is hard-coded off, so the
                                    # "bcftools" branch below is currently unreachable
                                    bcftools_preference = False

                                    # Check Annotation Tool
                                    if not annotation_tool:
                                        if (
                                            bcftools_preference
                                            and quick_annotation_format
                                            in ["vcf", "bed"]
                                            and quick_annotation_is_compressed
                                            and quick_annotation_is_indexed
                                        ):
                                            annotation_tool = "bcftools"
                                        # NOTE(review): "tsv" is listed twice below
                                        elif quick_annotation_format in [
                                            "vcf",
                                            "bed",
                                            "tsv",
                                            "tsv",
                                            "csv",
                                            "json",
                                            "tbl",
                                            "parquet",
                                            "duckdb",
                                        ]:
                                            annotation_tool = "parquet"
                                        else:
                                            log.error(
                                                f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet"
                                            )
                                            raise ValueError(
                                                f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet"
                                            )

                                    log.debug(
                                        f"Quick Annotation File {annotation_file} - Annotation tool: {annotation_tool}"
                                    )

                                    # Annotation Tool dispatch: register the found
                                    # file under param["annotation"][tool]["annotations"]
                                    if annotation_tool:
                                        if annotation_tool not in param["annotation"]:
                                            param["annotation"][annotation_tool] = {}
                                        if (
                                            "annotations"
                                            not in param["annotation"][annotation_tool]
                                        ):
                                            param["annotation"][annotation_tool][
                                                "annotations"
                                            ] = {}
                                        param["annotation"][annotation_tool][
                                            "annotations"
                                        ][annotation_file_found] = annotations

                                else:
                                    # Missing file is logged but does NOT raise
                                    log.error(
                                        f"Quick Annotation File {annotation_file} does NOT exist"
                                    )

            self.set_param(param)

        # Dispatch to per-tool annotation methods.
        # NOTE(review): the first five checks use truthiness while exomiser and
        # splice use `is not None` — an empty dict enables the latter two only.
        if param.get("annotation", None):
            log.info("Annotations")
            if param.get("annotation", {}).get("parquet", None):
                log.info("Annotations 'parquet'...")
                self.annotation_parquet()
            if param.get("annotation", {}).get("bcftools", None):
                log.info("Annotations 'bcftools'...")
                self.annotation_bcftools()
            if param.get("annotation", {}).get("snpsift", None):
                log.info("Annotations 'snpsift'...")
                self.annotation_snpsift()
            if param.get("annotation", {}).get("annovar", None):
                log.info("Annotations 'annovar'...")
                self.annotation_annovar()
            if param.get("annotation", {}).get("snpeff", None):
                log.info("Annotations 'snpeff'...")
                self.annotation_snpeff()
            if param.get("annotation", {}).get("exomiser", None) is not None:
                log.info("Annotations 'exomiser'...")
                self.annotation_exomiser()
            if param.get("annotation", {}).get("splice", None) is not None:
                log.info("Annotations 'splice' ...")
                self.annotation_splice()

        # Explode INFOS fields into table fields
        if self.get_explode_infos():
            self.explode_infos(
                prefix=self.get_explode_infos_prefix(),
                fields=self.get_explode_infos_fields(),
                force=True,
            )
    def annotation_snpsift(self, threads: int = None) -> None:
        """
        Annotate the loaded variants with SnpSift, using the databases
        configured in param["annotation"]["snpsift"]["annotations"].

        For each database, the current variants are exported to a temporary
        VCF, piped through `SnpSift annotate`, post-processed with
        `bcftools annotate` (field renaming), and the result is merged back
        with `update_from_vcf`. Databases must be bgzip-compressed and
        tabix-indexed, with a readable header file.

        :param threads: Number of threads to use; defaults to `get_threads()`
        :raises ValueError: if SnpSift or bcftools binaries are missing, or a
            database is not compressed/indexed/valid
        """

        # DEBUG
        # NOTE(review): log message says "bcftools databases" but this is the
        # SnpSift annotation path (runtime string left unchanged here)
        log.debug("Start annotation with bcftools databases")

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Config
        config = self.get_config()
        log.debug("Config: " + str(config))

        # Config - snpSift (java jar, run through the configured java command)
        snpsift_bin_command = get_bin_command(
            bin="SnpSift.jar",
            tool="snpsift",
            bin_type="jar",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff",
        )
        if not snpsift_bin_command:
            msg_err = f"Annotation failed: no snpsift bin '{snpsift_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - bcftools
        bcftools_bin_command = get_bin_command(
            bin="bcftools",
            tool="bcftools",
            bin_type="bin",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
        )
        if not bcftools_bin_command:
            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - BCFTools databases folders
        databases_folders = set(
            self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("annotations", ["."])
            + self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("bcftools", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # Param
        annotations = (
            self.get_param()
            .get("annotation", {})
            .get("snpsift", {})
            .get("annotations", None)
        )
        log.debug("Annotations: " + str(annotations))

        # Assembly
        assembly = self.get_param().get(
            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
        )

        # Data
        table_variants = self.get_table_variants()

        # Check if not empty — nothing to annotate on an empty variant table
        log.debug("Check if not empty")
        sql_query_chromosomes = (
            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
        )
        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
        if not sql_query_chromosomes_df["count"][0]:
            log.info(f"VCF empty")
            return

        # VCF header
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Existing annotations (debug only)
        for vcf_annotation in self.get_header().infos:

            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
            log.debug(
                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
            )

        if annotations:

            with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir:

                # Export VCF file (path only here; written later, before commands run)
                tmp_vcf_name = os.path.join(tmp_dir, "input.vcf.gz")

                # Init: maps command string -> output VCF path
                commands = {}

                for annotation in annotations:
                    annotation_fields = annotations[annotation]

                    # Annotation Name
                    annotation_name = os.path.basename(annotation)

                    # No fields configured means "all INFO fields"
                    if not annotation_fields:
                        annotation_fields = {"INFO": None}

                    log.debug(f"Annotation '{annotation_name}'")
                    log.debug(
                        f"Annotation '{annotation_name}' - fields: {annotation_fields}"
                    )

                    # Create Database
                    database = Database(
                        database=annotation,
                        databases_folders=databases_folders,
                        assembly=assembly,
                    )

                    # Find files
                    db_file = database.get_database()
                    db_file = full_path(db_file)
                    db_hdr_file = database.get_header_file()
                    db_hdr_file = full_path(db_hdr_file)
                    db_file_type = database.get_format()
                    db_tbi_file = f"{db_file}.tbi"
                    db_file_compressed = database.is_compressed()

                    # Check if compressed (required by SnpSift/tabix)
                    if not db_file_compressed:
                        log.error(
                            f"Annotation '{annotation}' - {db_file} NOT compressed file"
                        )
                        raise ValueError(
                            f"Annotation '{annotation}' - {db_file} NOT compressed file"
                        )

                    # Check if indexed (.tbi must exist alongside the database)
                    if not os.path.exists(db_tbi_file):
                        log.error(
                            f"Annotation '{annotation}' - {db_file} NOT indexed file"
                        )
                        raise ValueError(
                            f"Annotation '{annotation}' - {db_file} NOT indexed file"
                        )

                    # Check index - try to create if not exists
                    if not os.path.exists(db_file) or not os.path.exists(db_hdr_file):
                        log.error("Annotation failed: database not valid")
                        log.error(f"Annotation annotation file: {db_file}")
                        log.error(f"Annotation annotation header: {db_hdr_file}")
                        log.error(f"Annotation annotation index: {db_tbi_file}")
                        raise ValueError(
                            f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}"
                        )
                    else:

                        log.debug(
                            f"Annotation '{annotation}' - file: "
                            + str(db_file)
                            + " and "
                            + str(db_hdr_file)
                        )

                        # Load header as VCF object
                        db_hdr_vcf = Variants(input=db_hdr_file)
                        db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos
                        log.debug(
                            "Annotation database header: "
                            + str(db_hdr_vcf_header_infos)
                        )

                        # For all fields in database ("ALL"/"INFO" expands to
                        # every INFO field declared in the database header)
                        annotation_fields_full = False
                        if "ALL" in annotation_fields or "INFO" in annotation_fields:
                            annotation_fields = {
                                key: key for key in db_hdr_vcf_header_infos
                            }
                            log.debug(
                                "Annotation database header - All annotations added: "
                                + str(annotation_fields)
                            )
                            annotation_fields_full = True

                        # # Create file for field rename
                        # log.debug("Create file for field rename")
                        # tmp_rename = NamedTemporaryFile(
                        #     prefix=self.get_prefix(),
                        #     dir=self.get_tmp_dir(),
                        #     suffix=".rename",
                        #     delete=False,
                        # )
                        # tmp_rename_name = tmp_rename.name
                        # tmp_files.append(tmp_rename_name)

                        # Number of fields
                        nb_annotation_field = 0
                        annotation_list = []
                        annotation_infos_rename_list = []

                        for annotation_field in annotation_fields:

                            # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
                            annotation_fields_new_name = annotation_fields.get(
                                annotation_field, annotation_field
                            )
                            if not annotation_fields_new_name:
                                annotation_fields_new_name = annotation_field

                            # Check if field is in DB and if field is not elready in input data
                            if (
                                annotation_field in db_hdr_vcf.get_header().infos
                                and annotation_fields_new_name
                                not in self.get_header().infos
                            ):

                                log.info(
                                    f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'"
                                )

                                # BCFTools annotate param to rename fields
                                if annotation_field != annotation_fields_new_name:
                                    annotation_infos_rename_list.append(
                                        f"{annotation_fields_new_name}:=INFO/{annotation_field}"
                                    )

                                # Add INFO field to header (fall back to safe
                                # defaults when the database header is sparse)
                                db_hdr_vcf_header_infos_number = (
                                    db_hdr_vcf_header_infos[annotation_field].num or "."
                                )
                                db_hdr_vcf_header_infos_type = (
                                    db_hdr_vcf_header_infos[annotation_field].type
                                    or "String"
                                )
                                db_hdr_vcf_header_infos_description = (
                                    db_hdr_vcf_header_infos[annotation_field].desc
                                    or f"{annotation_field} description"
                                )
                                db_hdr_vcf_header_infos_source = (
                                    db_hdr_vcf_header_infos[annotation_field].source
                                    or "unknown"
                                )
                                db_hdr_vcf_header_infos_version = (
                                    db_hdr_vcf_header_infos[annotation_field].version
                                    or "unknown"
                                )

                                vcf_reader.infos[annotation_fields_new_name] = (
                                    vcf.parser._Info(
                                        annotation_fields_new_name,
                                        db_hdr_vcf_header_infos_number,
                                        db_hdr_vcf_header_infos_type,
                                        db_hdr_vcf_header_infos_description,
                                        db_hdr_vcf_header_infos_source,
                                        db_hdr_vcf_header_infos_version,
                                        self.code_type_map[
                                            db_hdr_vcf_header_infos_type
                                        ],
                                    )
                                )

                                annotation_list.append(annotation_field)

                                nb_annotation_field += 1

                            else:

                                if (
                                    annotation_field
                                    not in db_hdr_vcf.get_header().infos
                                ):
                                    log.warning(
                                        f"Annotation '{annotation_name}' - '{annotation_field}' - not available in vcf/bed file"
                                    )
                                if (
                                    annotation_fields_new_name
                                    in self.get_header().infos
                                ):
                                    log.warning(
                                        f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' - already exists (skipped)"
                                    )

                        log.info(
                            f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file"
                        )

                        annotation_infos = ",".join(annotation_list)

                        if annotation_infos != "":

                            # Annotated VCF (and error file)
                            tmp_annotation_vcf_name = os.path.join(
                                tmp_dir, os.path.basename(annotation) + ".vcf.gz"
                            )
                            tmp_annotation_vcf_name_err = (
                                tmp_annotation_vcf_name + ".err"
                            )

                            # Add fields to annotate (omit -info when all fields wanted)
                            if not annotation_fields_full:
                                annotation_infos_option = f"-info {annotation_infos}"
                            else:
                                annotation_infos_option = ""

                            # Info fields rename
                            if annotation_infos_rename_list:
                                annotation_infos_rename = " -c " + ",".join(
                                    annotation_infos_rename_list
                                )
                            else:
                                annotation_infos_rename = ""

                            # Annotate command: SnpSift annotate piped into
                            # bcftools annotate (rename + bgzip level 1 output)
                            command_annotate = f"{snpsift_bin_command} annotate {annotation_infos_option} {db_file} {tmp_vcf_name} | {bcftools_bin_command} annotate --threads={threads} {annotation_infos_rename} -Oz1 -o {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} "

                            # Add command
                            commands[command_annotate] = tmp_annotation_vcf_name

                if commands:

                    # Export VCF file (input for all annotate commands)
                    self.export_variant_vcf(
                        vcf_file=tmp_vcf_name,
                        remove_info=True,
                        add_samples=False,
                        index=True,
                    )
                    # NOTE(review): debug leftover — hardcoded /tmp path,
                    # clobbered on every run; consider removing
                    shutil.copyfile(tmp_vcf_name, "/tmp/input.vcf")

                    # Num command
                    nb_command = 0

                    # Annotate: commands run sequentially, each result merged
                    # back before the next (all commands share tmp_vcf_name input)
                    for command_annotate in commands:
                        nb_command += 1
                        log.info(
                            f"Annotation - Annotate [{nb_command}/{len(commands)}]..."
                        )
                        log.debug(f"command_annotate={command_annotate}")
                        run_parallel_commands([command_annotate], threads)

                        # Debug
                        # NOTE(review): debug leftover — hardcoded /tmp path
                        shutil.copyfile(commands[command_annotate], "/tmp/snpsift.vcf")

                        # Update variants
                        log.info(
                            f"Annotation - Updating [{nb_command}/{len(commands)}]..."
                        )
                        self.update_from_vcf(commands[command_annotate])
    def annotation_bcftools(self, threads: int = None) -> None:
        """
        Annotate the loaded variants with bcftools, using the databases
        configured in param["annotation"]["bcftools"]["annotations"].

        The variants are exported to a temporary VCF; for each database and
        each chromosome, a merged-region BED restricts `bcftools annotate`
        to the relevant intervals. All per-chromosome outputs are merged with
        `bcftools merge` and the result is loaded back via `update_from_vcf`.
        Databases must be bgzip-compressed and tabix-indexed, with a readable
        header file.

        :param threads: Number of threads to use; defaults to `get_threads()`
        :raises ValueError: if the bcftools binary is missing, a database is
            not compressed/indexed/valid, or any command wrote "[E::" errors
        """

        # DEBUG
        log.debug("Start annotation with bcftools databases")

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Config
        config = self.get_config()
        log.debug("Config: " + str(config))

        # DEBUG
        # NOTE(review): delete_tmp is computed and logged but never read again
        # in this method — apparently dead; temp files are removed by the
        # merge command's "rm -f" instead
        delete_tmp = True
        if self.get_config().get("verbosity", "warning") in ["debug"]:
            delete_tmp = False
            log.debug("Delete tmp files/folders: " + str(delete_tmp))

        # Config - BCFTools bin command
        bcftools_bin_command = get_bin_command(
            bin="bcftools",
            tool="bcftools",
            bin_type="bin",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
        )
        if not bcftools_bin_command:
            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - BCFTools databases folders
        databases_folders = set(
            self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("annotations", ["."])
            + self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("bcftools", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # Param
        annotations = (
            self.get_param()
            .get("annotation", {})
            .get("bcftools", {})
            .get("annotations", None)
        )
        log.debug("Annotations: " + str(annotations))

        # Assembly
        assembly = self.get_param().get(
            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
        )

        # Data
        table_variants = self.get_table_variants()

        # Check if not empty — nothing to annotate on an empty variant table
        log.debug("Check if not empty")
        sql_query_chromosomes = (
            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
        )
        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
        if not sql_query_chromosomes_df["count"][0]:
            log.info(f"VCF empty")
            return

        # Export in VCF (path reserved here; file written later via export_variant_vcf)
        log.debug("Create initial file to annotate")
        tmp_vcf = NamedTemporaryFile(
            prefix=self.get_prefix(),
            dir=self.get_tmp_dir(),
            suffix=".vcf.gz",
            delete=False,
        )
        tmp_vcf_name = tmp_vcf.name

        # VCF header
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Existing annotations (debug only)
        for vcf_annotation in self.get_header().infos:

            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
            log.debug(
                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
            )

        if annotations:

            tmp_ann_vcf_list = []
            commands = []
            tmp_files = []
            err_files = []

            for annotation in annotations:
                annotation_fields = annotations[annotation]

                # Annotation Name
                annotation_name = os.path.basename(annotation)

                # No fields configured means "all INFO fields"
                if not annotation_fields:
                    annotation_fields = {"INFO": None}

                log.debug(f"Annotation '{annotation_name}'")
                log.debug(
                    f"Annotation '{annotation_name}' - fields: {annotation_fields}"
                )

                # Create Database
                database = Database(
                    database=annotation,
                    databases_folders=databases_folders,
                    assembly=assembly,
                )

                # Find files
                db_file = database.get_database()
                db_file = full_path(db_file)
                db_hdr_file = database.get_header_file()
                db_hdr_file = full_path(db_hdr_file)
                db_file_type = database.get_format()
                db_tbi_file = f"{db_file}.tbi"
                db_file_compressed = database.is_compressed()

                # Check if compressed (required by bcftools/tabix)
                if not db_file_compressed:
                    log.error(
                        f"Annotation '{annotation}' - {db_file} NOT compressed file"
                    )
                    raise ValueError(
                        f"Annotation '{annotation}' - {db_file} NOT compressed file"
                    )

                # Check if indexed (.tbi must exist alongside the database)
                if not os.path.exists(db_tbi_file):
                    log.error(f"Annotation '{annotation}' - {db_file} NOT indexed file")
                    raise ValueError(
                        f"Annotation '{annotation}' - {db_file} NOT indexed file"
                    )

                # Check index - try to create if not exists
                if not os.path.exists(db_file) or not os.path.exists(db_hdr_file):
                    log.error("Annotation failed: database not valid")
                    log.error(f"Annotation annotation file: {db_file}")
                    log.error(f"Annotation annotation header: {db_hdr_file}")
                    log.error(f"Annotation annotation index: {db_tbi_file}")
                    raise ValueError(
                        f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}"
                    )
                else:

                    log.debug(
                        f"Annotation '{annotation}' - file: "
                        + str(db_file)
                        + " and "
                        + str(db_hdr_file)
                    )

                    # Load header as VCF object
                    db_hdr_vcf = Variants(input=db_hdr_file)
                    db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos
                    log.debug(
                        "Annotation database header: " + str(db_hdr_vcf_header_infos)
                    )

                    # For all fields in database ("ALL"/"INFO" expands to every
                    # INFO field declared in the database header)
                    if "ALL" in annotation_fields or "INFO" in annotation_fields:
                        annotation_fields = {
                            key: key for key in db_hdr_vcf_header_infos
                        }
                        log.debug(
                            "Annotation database header - All annotations added: "
                            + str(annotation_fields)
                        )

                    # Number of fields
                    nb_annotation_field = 0
                    annotation_list = []

                    for annotation_field in annotation_fields:

                        # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
                        annotation_fields_new_name = annotation_fields.get(
                            annotation_field, annotation_field
                        )
                        if not annotation_fields_new_name:
                            annotation_fields_new_name = annotation_field

                        # Check if field is in DB and if field is not elready in input data
                        if (
                            annotation_field in db_hdr_vcf.get_header().infos
                            and annotation_fields_new_name
                            not in self.get_header().infos
                        ):

                            log.info(
                                f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'"
                            )

                            # Add INFO field to header (fall back to safe
                            # defaults when the database header is sparse)
                            db_hdr_vcf_header_infos_number = (
                                db_hdr_vcf_header_infos[annotation_field].num or "."
                            )
                            db_hdr_vcf_header_infos_type = (
                                db_hdr_vcf_header_infos[annotation_field].type
                                or "String"
                            )
                            db_hdr_vcf_header_infos_description = (
                                db_hdr_vcf_header_infos[annotation_field].desc
                                or f"{annotation_field} description"
                            )
                            db_hdr_vcf_header_infos_source = (
                                db_hdr_vcf_header_infos[annotation_field].source
                                or "unknown"
                            )
                            db_hdr_vcf_header_infos_version = (
                                db_hdr_vcf_header_infos[annotation_field].version
                                or "unknown"
                            )

                            vcf_reader.infos[annotation_fields_new_name] = (
                                vcf.parser._Info(
                                    annotation_fields_new_name,
                                    db_hdr_vcf_header_infos_number,
                                    db_hdr_vcf_header_infos_type,
                                    db_hdr_vcf_header_infos_description,
                                    db_hdr_vcf_header_infos_source,
                                    db_hdr_vcf_header_infos_version,
                                    self.code_type_map[db_hdr_vcf_header_infos_type],
                                )
                            )

                            # annotation_list.append(annotation_field)
                            # bcftools "-c" syntax: "NEW:=INFO/OLD" renames on the fly
                            if annotation_field != annotation_fields_new_name:
                                annotation_list.append(
                                    f"{annotation_fields_new_name}:=INFO/{annotation_field}"
                                )
                            else:
                                annotation_list.append(annotation_field)

                            nb_annotation_field += 1

                        else:

                            if annotation_field not in db_hdr_vcf.get_header().infos:
                                log.warning(
                                    f"Annotation '{annotation}' - '{annotation_field}' - not available in vcf/bed file"
                                )
                            if annotation_fields_new_name in self.get_header().infos:
                                log.warning(
                                    f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)"
                                )

                    log.info(
                        f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file"
                    )

                    annotation_infos = ",".join(annotation_list)

                    if annotation_infos != "":

                        # Protect header for bcftools (remove "#CHROM" and variants line)
                        log.debug("Protect Header file - remove #CHROM line if exists")
                        tmp_header_vcf = NamedTemporaryFile(
                            prefix=self.get_prefix(),
                            dir=self.get_tmp_dir(),
                            suffix=".hdr",
                            delete=False,
                        )
                        tmp_header_vcf_name = tmp_header_vcf.name
                        tmp_files.append(tmp_header_vcf_name)
                        # Command: keep only "##" meta lines from the header file
                        if db_hdr_file.endswith(".gz"):
                            command_extract_header = f"zcat {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}"
                        else:
                            command_extract_header = f"cat {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}"
                        # Run
                        run_parallel_commands([command_extract_header], 1)

                        # Find chomosomes
                        log.debug("Find chromosomes ")
                        sql_query_chromosomes = f"""SELECT table_variants.\"#CHROM\" as CHROM FROM {table_variants} as table_variants GROUP BY table_variants.\"#CHROM\""""
                        sql_query_chromosomes_df = self.get_query_to_df(
                            sql_query_chromosomes
                        )
                        chomosomes_list = list(sql_query_chromosomes_df["CHROM"])

                        log.debug("Chromosomes found: " + str(list(chomosomes_list)))

                        # BED columns in the annotation file (BED has no INFO,
                        # so positional columns are prepended)
                        if db_file_type in ["bed"]:
                            annotation_infos = "CHROM,POS,POS," + annotation_infos

                        for chrom in chomosomes_list:

                            # Create BED on initial VCF
                            log.debug("Create BED on initial VCF: " + str(tmp_vcf_name))
                            tmp_bed = NamedTemporaryFile(
                                prefix=self.get_prefix(),
                                dir=self.get_tmp_dir(),
                                suffix=".bed",
                                delete=False,
                            )
                            tmp_bed_name = tmp_bed.name
                            tmp_files.append(tmp_bed_name)

                            # Detecte regions: +/- 1Mb windows around each
                            # variant, clamped at 0, then merged into intervals
                            log.debug(
                                f"Annotation '{annotation}' - Chromosome '{chrom}' - Start detecting regions..."
                            )
                            window = 1000000
                            sql_query_intervals_for_bed = f"""
                                SELECT \"#CHROM\",
                                    CASE WHEN \"POS\"-{window}-1 < 0 THEN 0 ELSE \"POS\"-{window}-1 END,
                                    \"POS\"+{window}
                                FROM {table_variants} as table_variants
                                WHERE table_variants.\"#CHROM\" = '{chrom}'
                            """
                            regions = self.conn.execute(
                                sql_query_intervals_for_bed
                            ).fetchall()
                            merged_regions = merge_regions(regions)
                            log.debug(
                                f"Annotation '{annotation}' - Chromosome '{chrom}' - Stop detecting regions..."
                            )

                            header = ["#CHROM", "START", "END"]
                            with open(tmp_bed_name, "w") as f:
                                # Write the header with tab delimiter
                                f.write("\t".join(header) + "\n")
                                for d in merged_regions:
                                    # Write each data row with tab delimiter
                                    f.write("\t".join(map(str, d)) + "\n")

                            # Tmp files
                            tmp_annotation_vcf = NamedTemporaryFile(
                                prefix=self.get_prefix(),
                                dir=self.get_tmp_dir(),
                                suffix=".vcf.gz",
                                delete=False,
                            )
                            tmp_annotation_vcf_name = tmp_annotation_vcf.name
                            tmp_files.append(tmp_annotation_vcf_name)
                            tmp_ann_vcf_list.append(f"{tmp_annotation_vcf_name}")
                            tmp_annotation_vcf_name_err = (
                                tmp_annotation_vcf_name + ".err"
                            )
                            err_files.append(tmp_annotation_vcf_name_err)

                            # Annotate Command
                            log.debug(
                                f"Annotation '{annotation}' - add bcftools command"
                            )

                            # Command: region-restricted annotate, bgzip level 1
                            # output (-Oz1), then tabix-index the result
                            command_annotate = f"{bcftools_bin_command} annotate --pair-logic exact --regions-file={tmp_bed_name} -a {db_file} -h {tmp_header_vcf_name} -c {annotation_infos} {tmp_vcf_name} -o {tmp_annotation_vcf_name} -Oz1 2>>{tmp_annotation_vcf_name_err} && tabix {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} "

                            # Add command
                            commands.append(command_annotate)

            # if some commands
            if commands:

                # Export VCF file (shared input for all annotate commands)
                self.export_variant_vcf(
                    vcf_file=tmp_vcf_name,
                    remove_info=True,
                    add_samples=False,
                    index=True,
                )

                # Threads
                # calculate threads for annotated commands (split the thread
                # budget across commands run in parallel; minimum 1)
                if commands:
                    threads_bcftools_annotate = round(threads / len(commands))
                else:
                    threads_bcftools_annotate = 1

                if not threads_bcftools_annotate:
                    threads_bcftools_annotate = 1

                # Add threads option to bcftools commands
                if threads_bcftools_annotate > 1:
                    commands_threaded = []
                    for command in commands:
                        commands_threaded.append(
                            command.replace(
                                f"{bcftools_bin_command} annotate ",
                                f"{bcftools_bin_command} annotate --threads={threads_bcftools_annotate} ",
                            )
                        )
                    commands = commands_threaded

                # Command annotation multithreading
                log.debug(f"Annotation - Annotation commands: " + str(commands))
                log.info(
                    f"Annotation - Annotation multithreaded in "
                    + str(len(commands))
                    + " commands"
                )

                run_parallel_commands(commands, threads)

                # Merge
                tmp_ann_vcf_list_cmd = " ".join(tmp_ann_vcf_list)

                if tmp_ann_vcf_list_cmd:

                    # Tmp file
                    # NOTE(review): delete=True removes this file on close/GC
                    # while its path is handed to shell commands below — works
                    # because the handle stays referenced, but fragile; confirm
                    tmp_annotate_vcf = NamedTemporaryFile(
                        prefix=self.get_prefix(),
                        dir=self.get_tmp_dir(),
                        suffix=".vcf.gz",
                        delete=True,
                    )
                    tmp_annotate_vcf_name = tmp_annotate_vcf.name
                    tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
                    err_files.append(tmp_annotate_vcf_name_err)

                    # Tmp file remove command (cleanup chained after the merge)
                    tmp_files_remove_command = ""
                    if tmp_files:
                        tmp_files_remove_command = " && rm -f " + " ".join(tmp_files)

                    # Command merge
                    merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_ann_vcf_list_cmd} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} {tmp_files_remove_command}"
                    log.info(
                        f"Annotation - Annotation merging "
                        + str(len(commands))
                        + " annotated files"
                    )
                    log.debug(f"Annotation - merge command: {merge_command}")
                    run_parallel_commands([merge_command], 1)

                    # Error messages: scan stderr capture files; "[W::" lines
                    # are warnings, "[E::" lines are fatal
                    log.info(f"Error/Warning messages:")
                    error_message_command_all = []
                    error_message_command_warning = []
                    error_message_command_err = []
                    for err_file in err_files:
                        with open(err_file, "r") as f:
                            for line in f:
                                message = line.strip()
                                error_message_command_all.append(message)
                                if line.startswith("[W::"):
                                    error_message_command_warning.append(message)
                                if line.startswith("[E::"):
                                    error_message_command_err.append(
                                        f"{err_file}: " + message
                                    )
                    # log info
                    for message in list(
                        set(error_message_command_err + error_message_command_warning)
                    ):
                        log.info(f"   {message}")
                    # debug info
                    for message in list(set(error_message_command_all)):
                        log.debug(f"   {message}")
                    # failed
                    if len(error_message_command_err):
                        log.error("Annotation failed: Error in commands")
                        raise ValueError("Annotation failed: Error in commands")

                    # Update variants
                    log.info(f"Annotation - Updating...")
                    self.update_from_vcf(tmp_annotate_vcf_name)
{tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} {tmp_files_remove_command}"
                log.info(
                    f"Annotation - Annotation merging "
                    + str(len(commands))
                    + " annotated files"
                )
                log.debug(f"Annotation - merge command: {merge_command}")
                run_parallel_commands([merge_command], 1)

                # Collect bcftools/tabix stderr output: warnings ([W::]) are logged
                # at info level, errors ([E::]) abort the whole annotation step
                log.info(f"Error/Warning messages:")
                error_message_command_all = []
                error_message_command_warning = []
                error_message_command_err = []
                for err_file in err_files:
                    with open(err_file, "r") as f:
                        for line in f:
                            message = line.strip()
                            error_message_command_all.append(message)
                            if line.startswith("[W::"):
                                error_message_command_warning.append(message)
                            if line.startswith("[E::"):
                                error_message_command_err.append(
                                    f"{err_file}: " + message
                                )
                # log info (deduplicated warnings and errors)
                for message in list(
                    set(error_message_command_err + error_message_command_warning)
                ):
                    log.info(f"   {message}")
                # debug info (every captured message, deduplicated)
                for message in list(set(error_message_command_all)):
                    log.debug(f"   {message}")
                # failed: any [E::] line is fatal
                if len(error_message_command_err):
                    log.error("Annotation failed: Error in commands")
                    raise ValueError("Annotation failed: Error in commands")

                # Merge the annotated VCF back into the variants table
                log.info(f"Annotation - Updating...")
                self.update_from_vcf(tmp_annotate_vcf_name)

    def annotation_exomiser(self, threads: int = None) -> bool:
        """
        This function annotates with Exomiser.

        This function uses args as parameters, in section "annotation" -> "exomiser", with sections:
        - "analysis" (dict/file):
            Full analysis dictionary parameters (see Exomiser docs).
            Either a dict, or a file in JSON or YAML format.
            These parameters may change depending on other parameters (e.g. phenotypicFeatures/HPO)
            Default: None
        - "preset" (string):
            Analysis preset (available in config folder).
            Used if no full "analysis" is provided.
            Default: "exome"
        - "phenopacket" (dict/file):
            Samples and phenotypic features parameters (see Exomiser docs).
            Either a dict, or a file in JSON or YAML format.
            Default: None
        - "subject" (dict):
            Sample parameters (see Exomiser docs).
            Example:
                "subject":
                    {
                        "id": "ISDBM322017",
                        "sex": "FEMALE"
                    }
            Default: None
        - "sample" (string):
            Sample name to construct "subject" section:
                "subject":
                    {
                        "id": "<sample>",
                        "sex": "UNKNOWN_SEX"
                    }
            Default: None
        - "phenotypicFeatures" (dict)
            Phenotypic features to construct "subject" section.
            Example:
                "phenotypicFeatures":
                    [
                        { "type": { "id": "HP:0001159", "label": "Syndactyly" } },
                        { "type": { "id": "HP:0000486", "label": "Strabismus" } }
                    ]
        - "hpo" (list)
            List of HPO ids as phenotypic features.
            Example:
                "hpo": ['0001156', '0001363', '0011304', '0010055']
            Default: []
        - "outputOptions" (dict):
            Output options (see Exomiser docs).
            Default:
                "output_options" =
                    {
                        "outputContributingVariantsOnly": False,
                        "numGenes": 0,
                        "outputFormats": ["TSV_VARIANT", "VCF"]
                    }
        - "transcript_source" (string):
            Transcript source (either "refseq", "ucsc", "ensembl")
            Default: "refseq"
        - "exomiser_to_info" (boolean):
            Add exomiser TSV file columns as INFO fields in VCF.
            Default: False
        - "release" (string):
            Exomiser database release.
            If not exists, database release will be downloaded (take a while).
            Default: None (provided by application.properties configuration file)
        - "exomiser_application_properties" (file):
            Exomiser configuration file (see Exomiser docs).
            Useful to automatically download databases (especially for specific genome databases).

        Notes:
        - If no sample in parameters, first sample in VCF will be chosen
        - If no HPO found, "hiPhivePrioritiser" analysis step will be switched off

        :param threads: The number of threads to use
        :type threads: int
        :return: True when annotation ran, False when the VCF is empty or has no samples.
        """

        # DEBUG
        log.debug("Start annotation with Exomiser databases")

        # Threads: fall back to the instance-level setting
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Config
        config = self.get_config()
        log.debug("Config: " + str(config))

        # Config - Folders - Databases
        databases_folders = (
            config.get("folders", {})
            .get("databases", {})
            .get("exomiser", f"{DEFAULT_DATABASE_FOLDER}/exomiser/current")
        )
        databases_folders = full_path(databases_folders)
        # NOTE(review): missing folder is only logged here, not raised —
        # databases_download_exomiser below is expected to create/populate it; confirm
        if not os.path.exists(databases_folders):
            log.error(f"Databases annotations: {databases_folders} NOT found")
        log.debug("Databases annotations: " + str(databases_folders))

        # Config - Exomiser binary (jar resolved by glob pattern)
        exomiser_bin_command = get_bin_command(
            bin="exomiser-cli*.jar",
            tool="exomiser",
            bin_type="jar",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/exomiser",
        )
        log.debug("Exomiser bin command: " + str(exomiser_bin_command))
        if not exomiser_bin_command:
            msg_err = f"Annotation failed: no exomiser bin '{exomiser_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Param
        param = self.get_param()
        log.debug("Param: " + str(param))

        # Param - Exomiser section
        param_exomiser = param.get("annotation", {}).get("exomiser", {})
        log.debug(f"Param Exomiser: {param_exomiser}")

        # Param - Assembly (param overrides config, falls back to default)
        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
        log.debug("Assembly: " + str(assembly))

        # Data
        table_variants = self.get_table_variants()

        # Check if the variants table is not empty; nothing to annotate otherwise
        log.debug("Check if not empty")
        sql_query_chromosomes = (
            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
        )
        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
            log.info(f"VCF empty")
            return False

        # VCF header
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Samples: Exomiser needs at least one genotype column
        samples = self.get_header_sample_list()
        if not samples:
            log.error("No Samples in VCF")
            return False
        log.debug(f"Samples: {samples}")

        # Memory limit
        memory_limit = self.get_memory("8G")
        log.debug(f"memory_limit: {memory_limit}")

        # Exomiser java options
        exomiser_java_options = (
            f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} "
        )
        log.debug(f"Exomiser java options: {exomiser_java_options}")

        # Download Exomiser databases (if not already present; may take a while)
        exomiser_release = param_exomiser.get("release", None)
        exomiser_application_properties = param_exomiser.get(
            "exomiser_application_properties", None
        )
        databases_download_exomiser(
            assemblies=[assembly],
            exomiser_folder=databases_folders,
            exomiser_release=exomiser_release,
            exomiser_phenotype_release=exomiser_release,
            exomiser_application_properties=exomiser_application_properties,
        )

        # Force annotation (always re-annotate, even if an Exomiser INFO field exists)
        force_update_annotation = True

        if "Exomiser" not in self.get_header().infos or force_update_annotation:
            log.debug("Start annotation Exomiser")

            with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir:

                # tmp_dir = "/tmp/exomiser"

                ### ANALYSIS ###
                ################

                # Create analysis.json through analysis dict
                # either analysis in param or by default
                # depending on preset exome/genome)

                # Init analysis dict
                param_exomiser_analysis_dict = {}

                # analysis from param
                param_exomiser_analysis = param_exomiser.get("analysis", {})
                param_exomiser_analysis = full_path(param_exomiser_analysis)

                # If analysis in param -> load analysis json
                if param_exomiser_analysis:

                    # If param analysis is a file and exists
                    if isinstance(param_exomiser_analysis, str) and os.path.exists(
                        param_exomiser_analysis
                    ):
                        # Load analysis file into analysis dict
                        # (yaml.safe_load parses both YAML and JSON)
                        with open(param_exomiser_analysis) as json_file:
                            param_exomiser_analysis_dict = yaml.safe_load(json_file)

                    # If param analysis is a dict
                    elif isinstance(param_exomiser_analysis, dict):
                        # Load analysis dict into analysis dict (either yaml or json)
                        param_exomiser_analysis_dict = param_exomiser_analysis

                    # Error analysis type
                    else:
                        log.error(f"Analysis type unknown. Check param file.")
                        raise ValueError(f"Analysis type unknown. Check param file.")

                # Case no input analysis config file/dict
                # Use preset (exome/genome) to open default config file
                if not param_exomiser_analysis_dict:

                    # default preset
                    default_preset = "exome"

                    # Get param preset or default preset
                    param_exomiser_preset = param_exomiser.get("preset", default_preset)

                    # Try to find if preset is a file
                    if os.path.exists(param_exomiser_preset):
                        # Preset file is provided in full path
                        param_exomiser_analysis_default_config_file = (
                            param_exomiser_preset
                        )
                    # elif os.path.exists(full_path(param_exomiser_preset)):
                    #     # Preset file is provided in full path
                    #     param_exomiser_analysis_default_config_file = full_path(param_exomiser_preset)
                    elif os.path.exists(
                        os.path.join(folder_config, param_exomiser_preset)
                    ):
                        # Preset file is provided as a basename in config folder (can be a path with subfolders)
                        param_exomiser_analysis_default_config_file = os.path.join(
                            folder_config, param_exomiser_preset
                        )
                    else:
                        # Construct preset file name from the preset label
                        param_exomiser_analysis_default_config_file = os.path.join(
                            folder_config,
                            f"preset-{param_exomiser_preset}-analysis.json",
                        )

                    # If preset file exists
                    param_exomiser_analysis_default_config_file = full_path(
                        param_exomiser_analysis_default_config_file
                    )
                    if os.path.exists(param_exomiser_analysis_default_config_file):
                        # Load preset file into analysis dict (either yaml or json)
                        with open(
                            param_exomiser_analysis_default_config_file
                        ) as json_file:
                            # param_exomiser_analysis_dict[""] = json.load(json_file)
                            param_exomiser_analysis_dict["analysis"] = yaml.safe_load(
                                json_file
                            )

                    # Error preset file
                    else:
                        log.error(
                            f"No analysis preset config file ({param_exomiser_analysis_default_config_file})"
                        )
                        raise ValueError(
                            f"No analysis preset config file ({param_exomiser_analysis_default_config_file})"
                        )

                # If no analysis dict created
                if not param_exomiser_analysis_dict:
                    log.error(f"No analysis config")
                    raise ValueError(f"No analysis config")

                # Log
                log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}")

                ### PHENOPACKET ###
                ###################

                # If no PhenoPacket in analysis dict -> check in param
                if "phenopacket" not in param_exomiser_analysis_dict:

                    # If PhenoPacket in param -> load analysis json
                    if param_exomiser.get("phenopacket", None):

                        param_exomiser_phenopacket = param_exomiser.get("phenopacket")
                        param_exomiser_phenopacket = full_path(
                            param_exomiser_phenopacket
                        )

                        # If param phenopacket is a file and exists
                        if isinstance(
                            param_exomiser_phenopacket, str
                        ) and os.path.exists(param_exomiser_phenopacket):
                            # Load phenopacket file into analysis dict (either yaml or json)
                            with open(param_exomiser_phenopacket) as json_file:
                                param_exomiser_analysis_dict["phenopacket"] = (
                                    yaml.safe_load(json_file)
                                )

                        # If param phenopacket is a dict
                        elif isinstance(param_exomiser_phenopacket, dict):
                            # Load phenopacket dict into analysis dict (either yaml or json)
                            param_exomiser_analysis_dict["phenopacket"] = (
                                param_exomiser_phenopacket
                            )

                        # Error phenopacket type
                        else:
                            log.error(f"Phenopacket type unknown. Check param file.")
                            raise ValueError(
                                f"Phenopacket type unknown. Check param file."
                            )

                # If no PhenoPacket in analysis dict -> construct from sample and HPO in param
                if "phenopacket" not in param_exomiser_analysis_dict:

                    # Init PhenoPacket
                    param_exomiser_analysis_dict["phenopacket"] = {
                        "id": "analysis",
                        "proband": {},
                    }

                    ### Add subject ###

                    # If subject exists
                    param_exomiser_subject = param_exomiser.get("subject", {})

                    # If subject not exists -> find sample ID
                    if not param_exomiser_subject:

                        # Found sample ID in param
                        sample = param_exomiser.get("sample", None)

                        # Find sample ID (first sample of the VCF by default)
                        if not sample:
                            sample_list = self.get_header_sample_list()
                            if len(sample_list) > 0:
                                sample = sample_list[0]
                            else:
                                log.error(f"No sample found")
                                raise ValueError(f"No sample found")

                        # Create subject with unknown sex
                        param_exomiser_subject = {"id": sample, "sex": "UNKNOWN_SEX"}

                    # Add to dict
                    param_exomiser_analysis_dict["phenopacket"][
                        "subject"
                    ] = param_exomiser_subject

                    ### Add "phenotypicFeatures" ###

                    # If phenotypicFeatures exists
                    param_exomiser_phenotypicfeatures = param_exomiser.get(
                        "phenotypicFeatures", []
                    )

                    # If phenotypicFeatures not exists -> Try to infer from hpo list
                    if not param_exomiser_phenotypicfeatures:

                        # Found HPO in param
                        param_exomiser_hpo = param_exomiser.get("hpo", [])

                        # Split HPO if list in string format separated by comma
                        if isinstance(param_exomiser_hpo, str):
                            param_exomiser_hpo = param_exomiser_hpo.split(",")

                        # Create HPO list (strip everything but digits, re-prefix "HP:")
                        for hpo in param_exomiser_hpo:
                            hpo_clean = re.sub("[^0-9]", "", hpo)
                            param_exomiser_phenotypicfeatures.append(
                                {
                                    "type": {
                                        "id": f"HP:{hpo_clean}",
                                        "label": f"HP:{hpo_clean}",
                                    }
                                }
                            )

                    # Add to dict
                    param_exomiser_analysis_dict["phenopacket"][
                        "phenotypicFeatures"
                    ] = param_exomiser_phenotypicfeatures

                    # If phenotypicFeatures not exists -> Remove hiPhivePrioritiser step
                    # (the prioritiser needs HPO terms to run)
                    if not param_exomiser_phenotypicfeatures:
                        for step in param_exomiser_analysis_dict.get(
                            "analysis", {}
                        ).get("steps", []):
                            if "hiPhivePrioritiser" in step:
                                param_exomiser_analysis_dict.get("analysis", {}).get(
                                    "steps", []
                                ).remove(step)

                ### Add Input File ###

                # Initial file name and htsFiles
                tmp_vcf_name = os.path.join(tmp_dir, "initial.vcf.gz")
                param_exomiser_analysis_dict["phenopacket"]["htsFiles"] = [
                    {
                        "uri": tmp_vcf_name,
                        "htsFormat": "VCF",
                        "genomeAssembly": assembly,
                    }
                ]

                ### Add metaData ###

                # If metaData not in analysis dict
                # NOTE(review): the key is checked on the top-level dict but written
                # under "phenopacket" — looks like the check should be on
                # param_exomiser_analysis_dict["phenopacket"]; confirm intent
                if "metaData" not in param_exomiser_analysis_dict:
                    param_exomiser_analysis_dict["phenopacket"]["metaData"] = {
                        "created": f"{datetime.datetime.now()}".replace(" ", "T") + "Z",
                        "createdBy": "howard",
                        "phenopacketSchemaVersion": 1,
                    }

                ### OutputOptions ###

                # Init output result folder
                output_results = os.path.join(tmp_dir, "results")

                # If no outputOptions in analysis dict
                if "outputOptions" not in param_exomiser_analysis_dict:

                    # default output formats
                    defaut_output_formats = ["TSV_VARIANT", "VCF"]

                    # Get outputOptions in param
                    output_options = param_exomiser.get("outputOptions", None)

                    # If no output_options in param -> use defaults
                    if not output_options:
                        output_options = {
                            "outputContributingVariantsOnly": False,
                            "numGenes": 0,
                            "outputFormats": defaut_output_formats,
                        }

                    # Replace outputDirectory in output options
                    output_options["outputDirectory"] = output_results
                    output_options["outputFileName"] = "howard"

                    # Add outputOptions in analysis dict
                    param_exomiser_analysis_dict["outputOptions"] = output_options

                else:

                    # Replace output_results and extend output formats so the
                    # TSV_VARIANT and VCF results needed below are always produced
                    param_exomiser_analysis_dict["outputOptions"][
                        "outputDirectory"
                    ] = output_results
                    param_exomiser_analysis_dict["outputOptions"]["outputFormats"] = (
                        list(
                            set(
                                param_exomiser_analysis_dict.get(
                                    "outputOptions", {}
                                ).get("outputFormats", [])
                                + ["TSV_VARIANT", "VCF"]
                            )
                        )
                    )

                # log
                log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}")

                ### ANALYSIS FILE ###
                #####################

                ### Full JSON analysis config file ###

                exomiser_analysis = os.path.join(tmp_dir, "analysis.json")
                with open(exomiser_analysis, "w") as fp:
                    json.dump(param_exomiser_analysis_dict, fp, indent=4)

                ### SPLIT analysis and sample config files

                # Splitted analysis dict (shallow copy: "phenopacket" is popped
                # from the copy only, the original dict keeps it)
                param_exomiser_analysis_dict_for_split = (
                    param_exomiser_analysis_dict.copy()
                )

                # Phenopacket JSON file
                exomiser_analysis_phenopacket = os.path.join(
                    tmp_dir, "analysis_phenopacket.json"
                )
                with open(exomiser_analysis_phenopacket, "w") as fp:
                    json.dump(
                        param_exomiser_analysis_dict_for_split.get("phenopacket"),
                        fp,
                        indent=4,
                    )

                # Analysis JSON file without Phenopacket parameters
                param_exomiser_analysis_dict_for_split.pop("phenopacket")
                exomiser_analysis_analysis = os.path.join(
                    tmp_dir, "analysis_analysis.json"
                )
                with open(exomiser_analysis_analysis, "w") as fp:
                    json.dump(param_exomiser_analysis_dict_for_split, fp, indent=4)

                ### INITIAL VCF file ###
                ########################

                ### Create list of samples to use and include into initial VCF file ###

                # Subject (main sample)
                # Get sample ID in analysis dict
                sample_subject = (
                    param_exomiser_analysis_dict.get("phenopacket", {})
                    .get("subject", {})
                    .get("id", None)
                )
                sample_proband = (
                    param_exomiser_analysis_dict.get("phenopacket", {})
                    .get("proband", {})
                    .get("subject", {})
                    .get("id", None)
                )
                sample = []
                if sample_subject:
                    sample.append(sample_subject)
                if sample_proband:
                    sample.append(sample_proband)

                # Get sample ID within Pedigree
                pedigree_persons_list = (
                    param_exomiser_analysis_dict.get("phenopacket", {})
                    .get("pedigree", {})
                    .get("persons", {})
                )

                # Create list with all sample ID in pedigree (if exists)
                pedigree_persons = []
                for person in pedigree_persons_list:
                    pedigree_persons.append(person.get("individualId"))

                # Concat subject sample ID and samples ID in pedigree samples
                samples = list(set(sample + pedigree_persons))

                # Check if sample list is not empty
                if not samples:
                    log.error(f"No samples found")
                    raise ValueError(f"No samples found")

                # Create VCF with sample (either sample in param or first one by default)
                # Export VCF file
                self.export_variant_vcf(
                    vcf_file=tmp_vcf_name,
                    remove_info=True,
                    add_samples=True,
                    list_samples=samples,
                    index=False,
                )

                ### Execute Exomiser ###
                ########################

                # Init command
                # NOTE(review): exomiser_command appears unused below — candidate for removal
                exomiser_command = ""

                # Command exomiser options
                exomiser_options = f" --spring.config.location={databases_folders}/{assembly}/application.properties --exomiser.data-directory={databases_folders}/{assembly} "

                # Release
                exomiser_release = param_exomiser.get("release", None)
                if exomiser_release:
                    # phenotype data version
                    exomiser_options += (
                        f" --exomiser.phenotype.data-version={exomiser_release} "
                    )
                    # data version
                    exomiser_options += (
                        f" --exomiser.{assembly}.data-version={exomiser_release} "
                    )
                    # variant white list (only if the ClinVar whitelist file exists)
                    variant_white_list_file = (
                        f"{exomiser_release}_{assembly}_clinvar_whitelist.tsv.gz"
                    )
                    if os.path.exists(
                        os.path.join(
                            databases_folders, assembly, variant_white_list_file
                        )
                    ):
                        exomiser_options += f" --exomiser.{assembly}.variant-white-list-path={variant_white_list_file} "

                # transcript_source
                transcript_source = param_exomiser.get(
                    "transcript_source", None
                )  # ucsc, refseq, ensembl
                if transcript_source:
                    exomiser_options += (
                        f" --exomiser.{assembly}.transcript-source={transcript_source} "
                    )

                # If analysis contains proband param: use split analysis + phenopacket files
                if param_exomiser_analysis_dict.get("phenopacket", {}).get(
                    "proband", {}
                ):
                    exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis_analysis} --sample={exomiser_analysis_phenopacket} {exomiser_options} "

                # If no proband (usually uniq sample): use the full analysis file
                else:
                    exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis} {exomiser_options}"

                # Log
                log.debug(f"exomiser_command_analysis={exomiser_command_analysis}")

                # Run command (non-zero exit status is fatal)
                result = subprocess.call(
                    exomiser_command_analysis.split(), stdout=subprocess.PIPE
                )
                if result:
                    log.error("Exomiser command failed")
                    raise ValueError("Exomiser command failed")

                ### RESULTS ###
                ###############

                ### Annotate with TSV fields ###

                # Whether to explode TSV columns into INFO fields
                exomiser_to_info = param_exomiser.get("exomiser_to_info", False)

                # Init result tsv file
                output_results_tsv = os.path.join(output_results, "howard.variants.tsv")

                # Parse TSV file and explode columns in INFO field
                if exomiser_to_info and os.path.exists(output_results_tsv):

                    # Log
                    log.debug("Exomiser columns to VCF INFO field")

                    # Retrieve columns and types (LIMIT 0: only the schema is needed)
                    query = f""" SELECT * FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) LIMIT 0 """
                    output_results_tsv_df = self.get_query_to_df(query)
                    output_results_tsv_columns = output_results_tsv_df.columns.tolist()

                    # Init concat fields for update
                    sql_query_update_concat_fields = []

                    # Fields to avoid (variant coordinates, already in the table)
                    fields_to_avoid = [
                        "CONTIG",
                        "START",
                        "END",
                        "REF",
                        "ALT",
                        "QUAL",
                        "FILTER",
                        "GENOTYPE",
                    ]

                    # List all columns to add into header
                    for header_column in output_results_tsv_columns:

                        # If header column is enabled
                        if header_column not in fields_to_avoid:

                            # Header info type inferred from the pandas dtype
                            # NOTE(review): every non-object dtype is labelled
                            # "Integer" (including float dtypes), and object
                            # columns that fully coerce to numeric become
                            # "Float" — confirm this inference is intended
                            header_info_type = "String"
                            header_column_df = output_results_tsv_df[header_column]
                            header_column_df_dtype = header_column_df.dtype
                            if header_column_df_dtype == object:
                                if (
                                    pd.to_numeric(header_column_df, errors="coerce")
                                    .notnull()
                                    .all()
                                ):
                                    header_info_type = "Float"
                            else:
                                header_info_type = "Integer"

                            # Header info: sanitize column name for VCF INFO id
                            characters_to_validate = ["-"]
                            pattern = "[" + "".join(characters_to_validate) + "]"
                            header_info_name = re.sub(
                                pattern,
                                "_",
                                f"Exomiser_{header_column}".replace("#", ""),
                            )
                            header_info_number = "."
                            header_info_description = (
                                f"Exomiser {header_column} annotation"
                            )
                            header_info_source = "Exomiser"
                            header_info_version = "unknown"
                            header_info_code = CODE_TYPE_MAP[header_info_type]
                            vcf_reader.infos[header_info_name] = vcf.parser._Info(
                                header_info_name,
                                header_info_number,
                                header_info_type,
                                header_info_description,
                                header_info_source,
                                header_info_version,
                                header_info_code,
                            )

                            # Add field to add for update to concat fields
                            sql_query_update_concat_fields.append(
                                f"""
                                CASE
                                    WHEN table_parquet."{header_column}" NOT IN ('','.')
                                    THEN concat(
                                        '{header_info_name}=',
                                        table_parquet."{header_column}",
                                        ';'
                                    )
                                    ELSE ''
                                END
                                """
                            )

                    # Update query: append the concatenated Exomiser fields to INFO,
                    # joining TSV rows to variants on CHROM/POS/REF/ALT
                    sql_query_update = f"""
                    UPDATE {table_variants} as table_variants
                    SET INFO = concat(
                        CASE
                            WHEN INFO NOT IN ('', '.')
                            THEN INFO
                            ELSE ''
                        END,
                        CASE
                            WHEN table_variants.INFO NOT IN ('','.')
                            THEN ';'
                            ELSE ''
                        END,
                        (
                            SELECT
                                concat(
                                    {",".join(sql_query_update_concat_fields)}
                                )
                            FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) as table_parquet
                            WHERE concat('chr', CAST(table_parquet.\"CONTIG\" AS STRING)) = table_variants.\"#CHROM\"
                                AND table_parquet.\"START\" = table_variants.\"POS\"
                                AND table_parquet.\"ALT\" = table_variants.\"ALT\"
                                AND table_parquet.\"REF\" = table_variants.\"REF\"
                        )
                    )
                    ;
                    """

                    # Update
                    self.conn.execute(sql_query_update)

                ### Annotate with VCF INFO field ###

                # Init result VCF file
                output_results_vcf = os.path.join(output_results, "howard.vcf.gz")

                # If VCF exists
                if os.path.exists(output_results_vcf):

                    # Log
                    log.debug("Exomiser result VCF update variants")

                    # Find Exomiser INFO field annotation in header
                    with gzip.open(output_results_vcf, "rt") as f:
                        header_list = self.read_vcf_header(f)
                    exomiser_vcf_header = vcf.Reader(
                        io.StringIO("\n".join(header_list))
                    )

                    # Add annotation INFO field to header
                    vcf_reader.infos["Exomiser"] = exomiser_vcf_header.infos["Exomiser"]

                    # Update variants with VCF
                    self.update_from_vcf(output_results_vcf)

        return True

    def annotation_snpeff(self, threads: int = None) -> None:
        """
        This function annotates with snpEff.

        :param threads: The number of threads to use
        :type threads: int
        :return: None. Returns early (without annotating) when the VCF is empty.
        """

        # DEBUG
        log.debug("Start annotation with snpeff databases")

        # Threads: fall back to the instance-level setting
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # DEBUG: keep tmp files when verbosity is debug
        delete_tmp = True
        if self.get_config().get("verbosity", "warning") in ["debug"]:
            delete_tmp = False
        log.debug("Delete tmp files/folders: " + str(delete_tmp))

        # Config
        config = self.get_config()
        log.debug("Config: " + str(config))

        # Config - Folders - Databases
        databases_folders = (
            config.get("folders", {}).get("databases", {}).get("snpeff", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # # Config - Java
        # java_bin = get_bin(
        #     tool="java",
        #     bin="java",
        #     bin_type="bin",
        #     config=config,
        #     default_folder="/usr/bin",
        # )
        # if not (os.path.exists(java_bin) or (java_bin and which(java_bin))):
        #     log.error(f"Annotation failed: no java bin '{java_bin}'")
        #     raise ValueError(f"Annotation failed: no java bin '{java_bin}'")

        # # Config - snpEff bin
        # snpeff_jar = get_bin(
        #     tool="snpeff",
        #     bin="snpEff.jar",
        #     bin_type="jar",
        #     config=config,
        #     default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff",
        # )
        # if not (os.path.exists(snpeff_jar) or
(snpeff_jar and which(snpeff_jar))): 4893 # log.error(f"Annotation failed: no snpEff jar '{snpeff_jar}'") 4894 # raise ValueError(f"Annotation failed: no snpEff jar '{snpeff_jar}'") 4895 4896 # Config - snpEff bin command 4897 snpeff_bin_command = get_bin_command( 4898 bin="snpEff.jar", 4899 tool="snpeff", 4900 bin_type="jar", 4901 config=config, 4902 default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff", 4903 ) 4904 if not snpeff_bin_command: 4905 msg_err = f"Annotation failed: no snpeff bin '{snpeff_bin_command}'" 4906 log.error(msg_err) 4907 raise ValueError(msg_err) 4908 4909 # Config - snpEff databases 4910 snpeff_databases = ( 4911 config.get("folders", {}) 4912 .get("databases", {}) 4913 .get("snpeff", DEFAULT_SNPEFF_FOLDER) 4914 ) 4915 snpeff_databases = full_path(snpeff_databases) 4916 if snpeff_databases is not None and snpeff_databases != "": 4917 log.debug(f"Create snpEff databases folder") 4918 if not os.path.exists(snpeff_databases): 4919 os.makedirs(snpeff_databases) 4920 4921 # Param 4922 param = self.get_param() 4923 log.debug("Param: " + str(param)) 4924 4925 # Param 4926 options = param.get("annotation", {}).get("snpeff", {}).get("options", None) 4927 log.debug("Options: " + str(options)) 4928 4929 # Param - Assembly 4930 assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY)) 4931 4932 # Param - Options 4933 snpeff_options = ( 4934 param.get("annotation", {}).get("snpeff", {}).get("options", "") 4935 ) 4936 snpeff_stats = param.get("annotation", {}).get("snpeff", {}).get("stats", None) 4937 snpeff_csvstats = ( 4938 param.get("annotation", {}).get("snpeff", {}).get("csvStats", None) 4939 ) 4940 if snpeff_stats: 4941 snpeff_stats = snpeff_stats.replace("OUTPUT", self.get_output()) 4942 snpeff_stats = full_path(snpeff_stats) 4943 snpeff_options += f" -stats {snpeff_stats}" 4944 if snpeff_csvstats: 4945 snpeff_csvstats = snpeff_csvstats.replace("OUTPUT", self.get_output()) 4946 snpeff_csvstats = full_path(snpeff_csvstats) 4947 
snpeff_options += f" -csvStats {snpeff_csvstats}" 4948 4949 # Data 4950 table_variants = self.get_table_variants() 4951 4952 # Check if not empty 4953 log.debug("Check if not empty") 4954 sql_query_chromosomes = ( 4955 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 4956 ) 4957 # if not self.conn.execute(f"{sql_query_chromosomes}").df()["count"][0]: 4958 if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]: 4959 log.info(f"VCF empty") 4960 return 4961 4962 # Export in VCF 4963 log.debug("Create initial file to annotate") 4964 tmp_vcf = NamedTemporaryFile( 4965 prefix=self.get_prefix(), 4966 dir=self.get_tmp_dir(), 4967 suffix=".vcf.gz", 4968 delete=True, 4969 ) 4970 tmp_vcf_name = tmp_vcf.name 4971 4972 # VCF header 4973 vcf_reader = self.get_header() 4974 log.debug("Initial header: " + str(vcf_reader.infos)) 4975 4976 # Existing annotations 4977 for vcf_annotation in self.get_header().infos: 4978 4979 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 4980 log.debug( 4981 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 4982 ) 4983 4984 # Memory limit 4985 # if config.get("memory", None): 4986 # memory_limit = config.get("memory", "8G") 4987 # else: 4988 # memory_limit = "8G" 4989 memory_limit = self.get_memory("8G") 4990 log.debug(f"memory_limit: {memory_limit}") 4991 4992 # snpEff java options 4993 snpeff_java_options = ( 4994 f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} " 4995 ) 4996 log.debug(f"Exomiser java options: {snpeff_java_options}") 4997 4998 force_update_annotation = True 4999 5000 if "ANN" not in self.get_header().infos or force_update_annotation: 5001 5002 # Check snpEff database 5003 log.debug(f"Check snpEff databases {[assembly]}") 5004 databases_download_snpeff( 5005 folder=snpeff_databases, assemblies=[assembly], config=config 5006 ) 5007 5008 # Export VCF file 5009 self.export_variant_vcf( 5010 vcf_file=tmp_vcf_name, 5011 remove_info=True, 
5012 add_samples=False, 5013 index=True, 5014 ) 5015 5016 # Tmp file 5017 err_files = [] 5018 tmp_annotate_vcf = NamedTemporaryFile( 5019 prefix=self.get_prefix(), 5020 dir=self.get_tmp_dir(), 5021 suffix=".vcf", 5022 delete=False, 5023 ) 5024 tmp_annotate_vcf_name = tmp_annotate_vcf.name 5025 tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err" 5026 err_files.append(tmp_annotate_vcf_name_err) 5027 5028 # Command 5029 snpeff_command = f"{snpeff_bin_command} {assembly} -dataDir {snpeff_databases} {snpeff_options} {tmp_vcf_name} 1>{tmp_annotate_vcf_name} 2>>{tmp_annotate_vcf_name_err}" 5030 log.debug(f"Annotation - snpEff command: {snpeff_command}") 5031 run_parallel_commands([snpeff_command], 1) 5032 5033 # Error messages 5034 log.info(f"Error/Warning messages:") 5035 error_message_command_all = [] 5036 error_message_command_warning = [] 5037 error_message_command_err = [] 5038 for err_file in err_files: 5039 with open(err_file, "r") as f: 5040 for line in f: 5041 message = line.strip() 5042 error_message_command_all.append(message) 5043 if line.startswith("[W::"): 5044 error_message_command_warning.append(message) 5045 if line.startswith("[E::"): 5046 error_message_command_err.append(f"{err_file}: " + message) 5047 # log info 5048 for message in list( 5049 set(error_message_command_err + error_message_command_warning) 5050 ): 5051 log.info(f" {message}") 5052 # debug info 5053 for message in list(set(error_message_command_all)): 5054 log.debug(f" {message}") 5055 # failed 5056 if len(error_message_command_err): 5057 log.error("Annotation failed: Error in commands") 5058 raise ValueError("Annotation failed: Error in commands") 5059 5060 # Find annotation in header 5061 with open(tmp_annotate_vcf_name, "rt") as f: 5062 header_list = self.read_vcf_header(f) 5063 annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list))) 5064 5065 for ann in annovar_vcf_header.infos: 5066 if ann not in self.get_header().infos: 5067 vcf_reader.infos[ann] = 
annovar_vcf_header.infos.get(ann)

            # Update variants
            log.info(f"Annotation - Updating...")
            self.update_from_vcf(tmp_annotate_vcf_name)

        else:
            if "ANN" in self.get_header().infos:
                log.debug(f"Existing snpEff annotations in VCF")
            if force_update_annotation:
                log.debug(f"Existing snpEff annotations in VCF - annotation forced")

    def annotation_annovar(self, threads: int = None) -> None:
        """
        Annotate the variants table with ANNOVAR databases.

        Workflow: export the variants to a temporary bgzipped VCF, run
        ``table_annovar.pl`` once per configured database, post-process each
        annotated VCF through a sed/awk/bcftools pipe (clean ANNOVAR_DATE,
        un-escape special characters, drop empty fields, keep/rename only the
        requested INFO fields), merge all annotated files with
        ``bcftools merge``, update the variants table from the merged VCF,
        and finally remove the temporary files.

        :param threads: number of threads to use; defaults to `self.get_threads()`
        :return: None
        :raises ValueError: if the annovar/bcftools binaries are not found, or
            if any annotation command wrote an error line to its stderr file
        """

        # DEBUG
        log.debug("Start annotation with Annovar databases")

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Temporary and error files collected for final cleanup
        tmp_files = []
        err_files = []

        # DEBUG
        # NOTE(review): delete_tmp is computed but not used below — cleanup
        # always runs (see the `if True:` block at the end); confirm intended.
        delete_tmp = True
        if self.get_config().get("verbosity", "warning") in ["debug"]:
            delete_tmp = False
        log.debug("Delete tmp files/folders: " + str(delete_tmp))

        # Config
        config = self.get_config()
        log.debug("Config: " + str(config))

        # Config - Folders - Databases
        databases_folders = (
            config.get("folders", {}).get("databases", {}).get("annovar", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # Config - annovar bin command (table_annovar.pl via perl)
        annovar_bin_command = get_bin_command(
            bin="table_annovar.pl",
            tool="annovar",
            bin_type="perl",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/annovar",
        )
        if not annovar_bin_command:
            msg_err = f"Annotation failed: no annovar bin '{annovar_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - BCFTools bin command
        bcftools_bin_command = get_bin_command(
            bin="bcftools",
            tool="bcftools",
            bin_type="bin",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
        )
        if not bcftools_bin_command:
            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - annovar databases folder (created if missing)
        annovar_databases = (
            config.get("folders", {})
            .get("databases", {})
            .get("annovar", DEFAULT_ANNOVAR_FOLDER)
        )
        annovar_databases = full_path(annovar_databases)
        if annovar_databases != "" and not os.path.exists(annovar_databases):
            os.makedirs(annovar_databases)

        # Param
        param = self.get_param()
        log.debug("Param: " + str(param))

        # Param - options (extra table_annovar.pl options)
        options = param.get("annotation", {}).get("annovar", {}).get("options", {})
        log.debug("Options: " + str(options))

        # Param - annotations (database name -> field mapping)
        annotations = (
            param.get("annotation", {}).get("annovar", {}).get("annotations", {})
        )
        log.debug("Annotations: " + str(annotations))

        # Param - Assembly
        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))

        # Annovar database assembly sub-folder (created if missing)
        annovar_databases_assembly = f"{annovar_databases}/{assembly}"
        if annovar_databases_assembly != "" and not os.path.exists(
            annovar_databases_assembly
        ):
            os.makedirs(annovar_databases_assembly)

        # Data
        table_variants = self.get_table_variants()

        # Check if not empty — nothing to annotate in an empty VCF
        log.debug("Check if not empty")
        sql_query_chromosomes = (
            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
        )
        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
        if not sql_query_chromosomes_df["count"][0]:
            log.info(f"VCF empty")
            return

        # VCF header
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Existing annotations (debug listing only)
        for vcf_annotation in self.get_header().infos:

            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
            log.debug(
                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
            )

        # NOTE(review): hard-coded to always re-annotate existing fields
        force_update_annotation = True

        if annotations:

            commands = []
            tmp_annotates_vcf_name_list = []

            # Export in VCF
            log.debug("Create initial file to annotate")
            tmp_vcf = NamedTemporaryFile(
                prefix=self.get_prefix(),
                dir=self.get_tmp_dir(),
                suffix=".vcf.gz",
                delete=False,
            )
            tmp_vcf_name = tmp_vcf.name
            tmp_files.append(tmp_vcf_name)
            tmp_files.append(tmp_vcf_name + ".tbi")

            # Export VCF file (INFO stripped to '.', no samples, tabix-indexed)
            self.export_variant_vcf(
                vcf_file=tmp_vcf_name,
                remove_info=".",
                add_samples=False,
                index=True,
            )

            # Create file for field rename (consumed by bcftools --rename-annots)
            log.debug("Create file for field rename")
            tmp_rename = NamedTemporaryFile(
                prefix=self.get_prefix(),
                dir=self.get_tmp_dir(),
                suffix=".rename",
                delete=False,
            )
            tmp_rename_name = tmp_rename.name
            tmp_files.append(tmp_rename_name)

            # Check Annovar database (download missing ones)
            log.debug(
                f"Check Annovar databases {[assembly]}: {list(annotations.keys())}"
            )
            databases_download_annovar(
                folder=annovar_databases,
                files=list(annotations.keys()),
                assemblies=[assembly],
            )

            for annotation in annotations:
                annotation_fields = annotations[annotation]

                if not annotation_fields:
                    annotation_fields = {"INFO": None}

                log.info(f"Annotations Annovar - database '{annotation}'")
                log.debug(f"Annotation '{annotation}' - fields: {annotation_fields}")

                # Tmp file for annovar
                # NOTE(review): err_files is reset on every iteration, so the
                # error scan below only ever sees the current database's err
                # file — confirm intended.
                err_files = []
                tmp_annotate_vcf_directory = TemporaryDirectory(
                    prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".annovar"
                )
                tmp_annotate_vcf_prefix = tmp_annotate_vcf_directory.name + "/annovar"
                tmp_annotate_vcf_name_annovar = (
                    tmp_annotate_vcf_prefix + "." + assembly + "_multianno.vcf"
                )
                tmp_annotate_vcf_name_err = tmp_annotate_vcf_directory.name + "/.err"
                err_files.append(tmp_annotate_vcf_name_err)
                tmp_files.append(tmp_annotate_vcf_name_err)

                # Tmp file final vcf annotated by annovar
                tmp_annotate_vcf = NamedTemporaryFile(
                    prefix=self.get_prefix(),
                    dir=self.get_tmp_dir(),
                    suffix=".vcf.gz",
                    delete=False,
                )
                tmp_annotate_vcf_name = tmp_annotate_vcf.name
                tmp_annotates_vcf_name_list.append(tmp_annotate_vcf_name)
                tmp_files.append(tmp_annotate_vcf_name)
                tmp_files.append(tmp_annotate_vcf_name + ".tbi")

                # Fields to extract (and their renamed counterparts)
                annotation_list = []
                annotation_renamed_list = []

                for annotation_field in annotation_fields:

                    # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
                    annotation_fields_new_name = annotation_fields.get(
                        annotation_field, annotation_field
                    )
                    if not annotation_fields_new_name:
                        annotation_fields_new_name = annotation_field

                    if (
                        force_update_annotation
                        or annotation_fields_new_name not in self.get_header().infos
                    ):
                        annotation_list.append(annotation_field)
                        annotation_renamed_list.append(annotation_fields_new_name)
                    else:  # annotation_fields_new_name in self.get_header().infos and not force_update_annotation:
                        log.warning(
                            f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)"
                        )

                    # Add rename info (one "old new" pair per line)
                    run_parallel_commands(
                        [
                            f"echo 'INFO/{annotation_field} {annotation_fields_new_name}' >> {tmp_rename_name}"
                        ],
                        1,
                    )

                # log.debug("fields_to_removed: " + str(fields_to_removed))
                log.debug("annotation_list: " + str(annotation_list))

                # protocol
                protocol = annotation

                # argument
                argument = ""

                # operation: "f" filter-based by default, "g" gene-based for
                # refGene/ensGene databases, "r" region-based for cytoBand
                operation = "f"
                if annotation in ["refGene", "refGeneWithVer"] or annotation.startswith(
                    "ensGene"
                ):
                    operation = "g"
                    if options.get("genebase", None):
                        argument = f"""'{options.get("genebase","")}'"""
                elif annotation in ["cytoBand"]:
                    operation = "r"

                # argument option
                argument_option = ""
                if argument != "":
                    argument_option = " --argument " + argument

                # command options
                command_options = f""" --nastring . --vcfinput --polish --dot2underline --thread {threads} """  # --intronhgvs 10
                for option in options:
                    if option not in ["genebase"]:
                        command_options += f""" --{option}={options[option]}"""

                # Command

                # Command - Annovar
                command_annovar = f"""{annovar_bin_command} {tmp_vcf_name} {annovar_databases_assembly} --buildver {assembly} --outfile {tmp_annotate_vcf_prefix} --remove --protocol {protocol} --operation {operation} {argument_option} {command_options} 2>>{tmp_annotate_vcf_name_err} && mv {tmp_annotate_vcf_name_annovar} {tmp_annotate_vcf_name}.tmp.vcf """
                tmp_files.append(f"{tmp_annotate_vcf_name}.tmp.vcf")

                # Command - start pipe
                command_annovar += f""" && {bcftools_bin_command} view --threads={threads} {tmp_annotate_vcf_name}.tmp.vcf 2>>{tmp_annotate_vcf_name_err} """

                # Command - Clean INFO/ANNOVAR_DATE (due to Annovar issue with multiple TAGS!)
                command_annovar += """ | sed "s/ANNOVAR_DATE=[^;\t]*;//gi" """

                # Command - Special characters (refGene annotation)
                command_annovar += """ | sed "s/\\\\\\x3b/,/gi" """

                # Command - Clean empty fields (with value ".")
                command_annovar += """ | awk -F'\\t' -v OFS='\\t' '{if ($0 ~ /^#/) print; else {split($8,a,";");for(i=1;i<=length(a);i++) {split(a[i],b,"=");if(b[2]!=".") {c[b[1]]=b[2]}}; split($8,d,";");for(i=1;i<=length(d);i++) {split(d[i],e,"=");if(c[e[1]]!="") {if(info!="") {info=info";"}; info=info""e[1]"="c[e[1]]}}; if(info!="") {$8=info} else {$8=""}; delete c; info=""; print}}' """

                # Command - Extract only needed fields, and remove ANNOVAR fields, and compress and index final file
                annovar_fields_to_keep = ["INFO/ANNOVAR_DATE", "INFO/ALLELE_END"]
                if "ALL" not in annotation_list and "INFO" not in annotation_list:
                    # for ann in annotation_renamed_list:
                    for ann in annotation_list:
                        annovar_fields_to_keep.append(f"^INFO/{ann}")

                command_annovar += f""" | {bcftools_bin_command} annotate --pair-logic exact --threads={threads} -x {",".join(annovar_fields_to_keep)} --rename-annots={tmp_rename_name} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} """

                # Command - indexing
                command_annovar += f""" && tabix {tmp_annotate_vcf_name} """

                log.debug(f"Annotation - Annovar command: {command_annovar}")
                run_parallel_commands([command_annovar], 1)

                # Error messages: scan stderr capture files for warnings/errors
                log.info(f"Error/Warning messages:")
                error_message_command_all = []
                error_message_command_warning = []
                error_message_command_err = []
                for err_file in err_files:
                    with open(err_file, "r") as f:
                        for line in f:
                            message = line.strip()
                            error_message_command_all.append(message)
                            if line.startswith("[W::") or line.startswith("WARNING"):
                                error_message_command_warning.append(message)
                            if line.startswith("[E::") or line.startswith("ERROR"):
                                error_message_command_err.append(
                                    f"{err_file}: " + message
                                )
                # log info
                for message in list(
                    set(error_message_command_err + error_message_command_warning)
                ):
                    log.info(f"   {message}")
                # debug info
                for message in list(set(error_message_command_all)):
                    log.debug(f"   {message}")
                # failed: any error line aborts the whole annotation
                if len(error_message_command_err):
                    log.error("Annotation failed: Error in commands")
                    raise ValueError("Annotation failed: Error in commands")

            if tmp_annotates_vcf_name_list:

                # List of annotated files
                tmp_annotates_vcf_name_to_merge = " ".join(tmp_annotates_vcf_name_list)

                # Tmp file
                tmp_annotate_vcf = NamedTemporaryFile(
                    prefix=self.get_prefix(),
                    dir=self.get_tmp_dir(),
                    suffix=".vcf.gz",
                    delete=False,
                )
                tmp_annotate_vcf_name = tmp_annotate_vcf.name
                tmp_files.append(tmp_annotate_vcf_name)
                tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
                err_files.append(tmp_annotate_vcf_name_err)
                tmp_files.append(tmp_annotate_vcf_name_err)

                # Command merge: combine all per-database annotated VCFs
                merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_annotates_vcf_name_to_merge} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} "
                log.info(
                    f"Annotation Annovar - Annotation merging "
                    + str(len(tmp_annotates_vcf_name_list))
                    + " annotated files"
                )
                log.debug(f"Annotation - merge command: {merge_command}")
                run_parallel_commands([merge_command], 1)

                # Find annotation in header of the merged file and propagate
                # any new INFO definitions into our in-memory header
                with bgzf.open(tmp_annotate_vcf_name, "rt") as f:
                    header_list = self.read_vcf_header(f)
                annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list)))

                for ann in annovar_vcf_header.infos:
                    if ann not in self.get_header().infos:
                        vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann)

                # Update variants
                log.info(f"Annotation Annovar - Updating...")
                self.update_from_vcf(tmp_annotate_vcf_name)

        # Clean files
        # Tmp file remove command
        if True:
            tmp_files_remove_command = ""
            if tmp_files:
                tmp_files_remove_command = " ".join(tmp_files)
            clean_command = f" rm -f {tmp_files_remove_command} "
            log.debug(f"Annotation Annovar - Annotation cleaning ")
            log.debug(f"Annotation - cleaning command: {clean_command}")
            run_parallel_commands([clean_command], 1)

    # Parquet
    def annotation_parquet(self, threads: int = None) -> None:
        """
        Annotate the variants table with parquet (or attached database) files.

        For each configured annotation database, loads its header, maps the
        requested INFO fields, and builds per-chromosome SQL UPDATE queries
        that concatenate the new annotations into the variants table's INFO
        column.

        :param threads: number of threads to use for the annotation; defaults
            to `self.get_threads()`
        :return: None
        :raises ValueError: if an annotation database file or its header
            file cannot be found
        """

        # DEBUG
        log.debug("Start annotation with parquet databases")

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # DEBUG
        # NOTE(review): delete_tmp is computed but not used in this method —
        # confirm intended.
        delete_tmp = True
        if self.get_config().get("verbosity", "warning") in ["debug"]:
            delete_tmp = False
        log.debug("Delete tmp files/folders: " + str(delete_tmp))

        # Config: union of "annotations" and "parquet" database folders
        databases_folders = set(
            self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("annotations", ["."])
            + self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("parquet", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # Param
        annotations = (
            self.get_param()
            .get("annotation", {})
            .get("parquet", {})
            .get("annotations", None)
        )
        log.debug("Annotations: " + str(annotations))

        # Assembly
        assembly = self.get_param().get(
            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
        )

        # Force Update Annotation: re-annotate fields already in the header
        force_update_annotation = (
            self.get_param()
            .get("annotation", {})
            .get("options", {})
            .get("annotations_update",
False)
        )
        log.debug(f"force_update_annotation={force_update_annotation}")
        # Force Append Annotation: only fill fields that are empty ('' or '.')
        force_append_annotation = (
            self.get_param()
            .get("annotation", {})
            .get("options", {})
            .get("annotations_append", False)
        )
        log.debug(f"force_append_annotation={force_append_annotation}")

        # Data
        table_variants = self.get_table_variants()

        # Check if not empty — nothing to annotate in an empty VCF
        log.debug("Check if not empty")
        sql_query_chromosomes_df = self.get_query_to_df(
            f"""SELECT count(*) as count FROM {table_variants} as table_variants LIMIT 1"""
        )
        if not sql_query_chromosomes_df["count"][0]:
            log.info(f"VCF empty")
            return

        # VCF header
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Nb Variants POS (total count, used for the final summary log)
        log.debug("NB Variants Start")
        nb_variants = self.conn.execute(
            f"SELECT count(*) AS count FROM variants"
        ).fetchdf()["count"][0]
        log.debug("NB Variants Stop")

        # Existing annotations (debug listing only)
        for vcf_annotation in self.get_header().infos:

            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
            log.debug(
                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
            )

        # Added columns (dropped at the end of the method)
        added_columns = []

        # drop indexes before the bulk UPDATEs
        log.debug(f"Drop indexes...")
        self.drop_indexes()

        if annotations:

            # "ALL" pseudo-annotation: scan available databases and add each
            # as a full-INFO annotation
            if "ALL" in annotations:

                all_param = annotations.get("ALL", {})
                all_param_formats = all_param.get("formats", None)
                all_param_releases = all_param.get("releases", None)

                databases_infos_dict = self.scan_databases(
                    database_formats=all_param_formats,
                    database_releases=all_param_releases,
                )
                for database_infos in databases_infos_dict.keys():
                    if database_infos not in annotations:
                        annotations[database_infos] = {"INFO": None}

            for annotation in annotations:

                # "ALL" itself is a marker, not a database
                if annotation in ["ALL"]:
                    continue

                # Annotation Name
                annotation_name = os.path.basename(annotation)

                # Annotation fields (empty mapping means "whole INFO")
                annotation_fields = annotations[annotation]
                if not annotation_fields:
                    annotation_fields = {"INFO": None}

                log.debug(f"Annotation '{annotation_name}'")
                log.debug(
                    f"Annotation '{annotation_name}' - fields: {annotation_fields}"
                )

                # Create Database
                database = Database(
                    database=annotation,
                    databases_folders=databases_folders,
                    assembly=assembly,
                )

                # Find files
                parquet_file = database.get_database()
                parquet_hdr_file = database.get_header_file()
                parquet_type = database.get_type()

                # Check if files exists
                if not parquet_file or not parquet_hdr_file:
                    log.error("Annotation failed: file not found")
                    raise ValueError("Annotation failed: file not found")
                else:
                    # Get parquet connexion (attach database if needed)
                    parquet_sql_attach = database.get_sql_database_attach(
                        output="query"
                    )
                    if parquet_sql_attach:
                        self.conn.execute(parquet_sql_attach)
                    parquet_file_link = database.get_sql_database_link()
                    # Log
                    log.debug(
                        f"Annotation '{annotation_name}' - file: "
                        + str(parquet_file)
                        + " and "
                        + str(parquet_hdr_file)
                    )

                    # Database full header columns
                    parquet_hdr_vcf_header_columns = database.get_header_file_columns(
                        parquet_hdr_file
                    )
                    # Log
                    log.debug(
                        "Annotation database header columns : "
                        + str(parquet_hdr_vcf_header_columns)
                    )

                    # Load header as VCF object
                    parquet_hdr_vcf_header_infos = database.get_header().infos
                    # Log
                    log.debug(
                        "Annotation database header: "
                        + str(parquet_hdr_vcf_header_infos)
                    )

                    # Get extra infos (non-standard columns of the database)
                    parquet_columns = database.get_extra_columns()
                    # Log
                    log.debug("Annotation database Columns: " + str(parquet_columns))

                    # Add extra columns if "ALL" in annotation_fields
                    # if "ALL" in annotation_fields:
                    #     allow_add_extra_column = True
                    if "ALL" in annotation_fields and database.get_extra_columns():
                        for extra_column in database.get_extra_columns():
                            if (
                                extra_column not in annotation_fields
                                and extra_column.replace("INFO/", "")
                                not in parquet_hdr_vcf_header_infos
                            ):
                                # Synthesize a generic String INFO definition
                                # for columns absent from the header file
                                parquet_hdr_vcf_header_infos[extra_column] = (
                                    vcf.parser._Info(
                                        extra_column,
                                        ".",
                                        "String",
                                        f"{extra_column} description",
                                        "unknown",
                                        "unknown",
                                        self.code_type_map["String"],
                                    )
                                )

                    # For all fields in database
                    annotation_fields_all = False
                    if "ALL" in annotation_fields or "INFO" in annotation_fields:
                        annotation_fields_all = True
                        annotation_fields = {
                            key: key for key in parquet_hdr_vcf_header_infos
                        }

                        log.debug(
                            "Annotation database header - All annotations added: "
                            + str(annotation_fields)
                        )

                    # Init

                    # List of annotation fields to use
                    sql_query_annotation_update_info_sets = []

                    # List of annotation to agregate (regions databases)
                    sql_query_annotation_to_agregate = []

                    # Number of fields
                    nb_annotation_field = 0

                    # Annotation fields processed
                    annotation_fields_processed = []

                    # Columns mapping (INFO field name -> database column)
                    map_columns = database.map_columns(
                        columns=annotation_fields, prefixes=["INFO/"]
                    )

                    # Query dict for fields to remove (update option)
                    query_dict_remove = {}

                    # Fetch Anotation fields
                    for annotation_field in annotation_fields:

                        # annotation_field_column
                        annotation_field_column = map_columns.get(
                            annotation_field, "INFO"
                        )

                        # field new name, if parametered
                        annotation_fields_new_name = annotation_fields.get(
                            annotation_field, annotation_field
                        )
                        if not annotation_fields_new_name:
                            annotation_fields_new_name = annotation_field

                        # To annotate
                        # force_update_annotation = True
                        # force_append_annotation = True
                        # if annotation_field in parquet_hdr_vcf_header_infos and (force_update_annotation or (annotation_fields_new_name not in self.get_header().infos)):
                        if annotation_field in parquet_hdr_vcf_header_infos and (
                            force_update_annotation
                            or force_append_annotation
                            or (
                                annotation_fields_new_name
                                not in self.get_header().infos
                            )
                        ):

                            # Add field to annotation to process list
                            annotation_fields_processed.append(
                                annotation_fields_new_name
                            )

                            # explode infos for the field: when updating, first
                            # strip the existing value from the INFO column
                            annotation_fields_new_name_info_msg = ""
                            if (
                                force_update_annotation
                                and annotation_fields_new_name
                                in self.get_header().infos
                            ):
                                # Remove field from INFO
                                query = f"""
                                    UPDATE {table_variants} as table_variants
                                    SET INFO = REGEXP_REPLACE(
                                                concat(table_variants.INFO,''),
                                                ';*{annotation_fields_new_name}=[^;]*',
                                                ''
                                            )
                                    WHERE concat(';',table_variants.INFO) LIKE '%;{annotation_fields_new_name}=%'
                                    """
                                annotation_fields_new_name_info_msg = " [update]"
                                query_dict_remove[
                                    f"remove 'INFO/{annotation_fields_new_name}'"
                                ] = query

                            # Sep between fields in INFO
                            nb_annotation_field += 1
                            if nb_annotation_field > 1:
                                annotation_field_sep = ";"
                            else:
                                annotation_field_sep = ""

                            log.info(
                                f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'{annotation_fields_new_name_info_msg}"
                            )

                            # Add INFO field to header (with fallbacks for
                            # missing number/type/description/source/version)
                            parquet_hdr_vcf_header_infos_number = (
                                parquet_hdr_vcf_header_infos[annotation_field].num
                                or "."
                            )
                            parquet_hdr_vcf_header_infos_type = (
                                parquet_hdr_vcf_header_infos[annotation_field].type
                                or "String"
                            )
                            parquet_hdr_vcf_header_infos_description = (
                                parquet_hdr_vcf_header_infos[annotation_field].desc
                                or f"{annotation_field} description"
                            )
                            parquet_hdr_vcf_header_infos_source = (
                                parquet_hdr_vcf_header_infos[annotation_field].source
                                or "unknown"
                            )
                            parquet_hdr_vcf_header_infos_version = (
                                parquet_hdr_vcf_header_infos[annotation_field].version
                                or "unknown"
                            )

                            vcf_reader.infos[annotation_fields_new_name] = (
                                vcf.parser._Info(
                                    annotation_fields_new_name,
                                    parquet_hdr_vcf_header_infos_number,
                                    parquet_hdr_vcf_header_infos_type,
                                    parquet_hdr_vcf_header_infos_description,
                                    parquet_hdr_vcf_header_infos_source,
                                    parquet_hdr_vcf_header_infos_version,
                                    self.code_type_map[
                                        parquet_hdr_vcf_header_infos_type
                                    ],
                                )
                            )

                            # Append: extra WHERE condition so only empty
                            # values get filled
                            if force_append_annotation:
                                query_case_when_append = f""" AND REGEXP_EXTRACT(concat(';', table_variants.INFO), ';{annotation_fields_new_name}=([^;]*)',1) IN ('','.') """
                            else:
                                query_case_when_append = ""

                            # Annotation/Update query fields
                            # Found in INFO column
                            if (
                                annotation_field_column == "INFO"
                                and "INFO" in parquet_hdr_vcf_header_columns
                            ):
                                sql_query_annotation_update_info_sets.append(
                                    f"""
                                CASE WHEN REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1) NOT IN ('','.') {query_case_when_append}
                                        THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1))
                                        ELSE ''
                                END
                                """
                                )
                            # Found in a specific column
                            else:
                                # sql_query_annotation_update_info_sets.append(
                                #     f"""
                                # CASE WHEN table_parquet."{annotation_field_column}" NOT IN ('','.') {query_case_when_append}
                                #         THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', replace(table_parquet."{annotation_field_column}", ';', ','))
                                #         ELSE ''
                                # END
                                # """
                                # )
                                sql_query_annotation_update_info_sets.append(
                                    f"""
                                CASE WHEN table_parquet."{annotation_field_column}" NOT IN ('','.') {query_case_when_append}
                                        THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', replace(CAST(table_parquet."{annotation_field_column}" AS VARCHAR), ';', ','))
                                        ELSE ''
                                END
                                """
                                )
                                sql_query_annotation_to_agregate.append(
                                    f""" string_agg(DISTINCT table_parquet_from."{annotation_field_column}", ',') AS "{annotation_field_column}" """
                                )

                        # Not to annotate
                        else:

                            if force_update_annotation:
                                annotation_message = "forced"
                            else:
                                annotation_message = "skipped"

                            if annotation_field not in parquet_hdr_vcf_header_infos:
                                log.warning(
                                    f"Annotation '{annotation_name}' - '{annotation_field}' [{nb_annotation_field}] - not available in parquet file"
                                )
                            if annotation_fields_new_name in self.get_header().infos:
                                log.warning(
                                    f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' [{nb_annotation_field}] - already exists in header ({annotation_message})"
                                )

                    # Check if ALL fields have to be annotated. Thus concat all INFO field
                    # allow_annotation_full_info = True
                    allow_annotation_full_info = not force_append_annotation

                    if parquet_type in ["regions"]:
                        allow_annotation_full_info = False

                    if (
                        allow_annotation_full_info
                        and nb_annotation_field == len(annotation_fields)
                        and annotation_fields_all
                        and (
                            "INFO" in parquet_hdr_vcf_header_columns
                            and "INFO" in database.get_extra_columns()
                        )
                    ):
                        # Shortcut: copy the whole INFO column instead of
                        # per-field CASE expressions
                        log.debug("Column INFO annotation enabled")
                        sql_query_annotation_update_info_sets = []
                        sql_query_annotation_update_info_sets.append(
                            f" table_parquet.INFO "
                        )

                    if sql_query_annotation_update_info_sets:

                        # Annotate
                        log.info(f"Annotation '{annotation_name}' - Annotation...")

                        # Join query annotation update info sets for SQL
                        sql_query_annotation_update_info_sets_sql = ",".join(
                            sql_query_annotation_update_info_sets
                        )

                        # Check chromosomes list (and variants infos)
                        sql_query_chromosomes = f"""
                            SELECT table_variants."#CHROM" as CHROM, count(*) AS count_variants, min(POS) AS min_variants, MAX(POS) AS max_variants
                            FROM {table_variants} as table_variants
                            GROUP BY table_variants."#CHROM"
                            ORDER BY table_variants."#CHROM"
                            """
                        sql_query_chromosomes_df = self.conn.execute(
                            sql_query_chromosomes
                        ).df()
                        sql_query_chromosomes_dict = {
                            entry["CHROM"]: {
                                "count": entry["count_variants"],
                                "min": entry["min_variants"],
                                "max": entry["max_variants"],
                            }
                            for index, entry in sql_query_chromosomes_df.iterrows()
                        }

                        # Init
                        nb_of_query = 0
                        nb_of_variant_annotated = 0
                        # start from the "remove existing field" queries so
                        # they run before the per-chromosome updates
                        query_dict = query_dict_remove

                        # for chrom in sql_query_chromosomes_df["CHROM"]:
                        for chrom in sql_query_chromosomes_dict:

                            # Number of variant by chromosome
                            nb_of_variant_by_chrom = sql_query_chromosomes_dict.get(
                                chrom, {}
                            ).get("count", 0)

                            log.debug(
                                f"Annotation '{annotation_name}' - Chromosome '{chrom}' [{nb_of_variant_by_chrom} variants]..."
                            )

                            # Annotation with regions database: join on POS
                            # falling within [START+1, END], aggregate values
                            if parquet_type in ["regions"]:
                                sql_query_annotation_from_clause = f"""
                                    FROM (
                                        SELECT 
                                            '{chrom}' AS \"#CHROM\",
                                            table_variants_from.\"POS\" AS \"POS\",
                                            {",".join(sql_query_annotation_to_agregate)}
                                        FROM {table_variants} as table_variants_from
                                        LEFT JOIN {parquet_file_link} as table_parquet_from ON (
                                            table_parquet_from.\"#CHROM\" = '{chrom}'
                                            AND table_variants_from.\"POS\" <= table_parquet_from.\"END\"
                                            AND (table_variants_from.\"POS\" >= (table_parquet_from.\"START\"+1)
                                                OR table_variants_from.\"POS\" + (len(table_variants_from.\"REF\")-1) >= (table_parquet_from.\"START\"+1)
                                            )
                                        )
                                        WHERE table_variants_from.\"#CHROM\" in ('{chrom}')
                                        GROUP BY table_variants_from.\"POS\"
                                    )
                                    as table_parquet
                                """

                                sql_query_annotation_where_clause = """
                                    table_parquet.\"#CHROM\" = table_variants.\"#CHROM\"
                                    AND table_parquet.\"POS\" = table_variants.\"POS\"
                                """

                            # Annotation with variants database: exact match
                            # on CHROM/POS/REF/ALT
                            else:
                                sql_query_annotation_from_clause = f"""
                                    FROM {parquet_file_link} as table_parquet
                                """
                                sql_query_annotation_where_clause = f"""
                                    table_variants."#CHROM" = '{chrom}'
                                    AND table_parquet.\"#CHROM\" = table_variants.\"#CHROM\"
                                    AND table_parquet.\"POS\" = table_variants.\"POS\"
                                    AND table_parquet.\"ALT\" = table_variants.\"ALT\"
                                    AND table_parquet.\"REF\" = table_variants.\"REF\"
                                """

                            # Create update query: append new annotations to
                            # INFO, inserting ';' only when both sides are
                            # non-empty
                            sql_query_annotation_chrom_interval_pos = f"""
                                UPDATE {table_variants} as table_variants
                                SET INFO = 
                                    concat(
                                        CASE WHEN table_variants.INFO NOT IN ('','.')
                                            THEN table_variants.INFO
                                            ELSE ''
                                        END
                                        ,
                                        CASE WHEN table_variants.INFO NOT IN ('','.')
                                                AND (
                                                    concat({sql_query_annotation_update_info_sets_sql})
                                                    )
                                                    NOT IN ('','.') 
                                            THEN ';'
                                            ELSE ''
                                        END
                                        ,
                                        {sql_query_annotation_update_info_sets_sql}
                                    )
                                {sql_query_annotation_from_clause}
                                WHERE {sql_query_annotation_where_clause}
                                ;
                                """

                            # Add update query to dict
                            query_dict[
                                f"{chrom} [{nb_of_variant_by_chrom} variants]"
                            ] = sql_query_annotation_chrom_interval_pos

                        nb_of_query = len(query_dict)
                        num_query = 0

                        # SET max_expression_depth TO x — the generated CASE
                        # chains can exceed DuckDB's default depth
                        self.conn.execute("SET max_expression_depth TO 10000")

                        for query_name in query_dict:
                            query = query_dict[query_name]
                            num_query += 1
                            log.info(
                                f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name}..."
                            )
                            result = self.conn.execute(query)
                            nb_of_variant_annotated_by_query = result.df()["Count"][0]
                            nb_of_variant_annotated += nb_of_variant_annotated_by_query
                            log.info(
                                f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name} - {nb_of_variant_annotated_by_query} variants annotated"
                            )

                        log.info(
                            f"Annotation '{annotation_name}' - Annotation of {nb_of_variant_annotated} variants out of {nb_variants} (with {nb_of_query} queries)"
                        )

                    else:

                        log.info(
                            f"Annotation '{annotation_name}' - No Annotations available"
                        )

        log.debug("Final header: " + str(vcf_reader.infos))

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)

    def annotation_splice(self, threads: int = None) -> None:
        """
        Annotate variants with splice prediction tools (run through a Docker
        image configured under config["tools"]["splice"]).

        :param threads: The number of threads to use; defaults to
            `self.get_threads()`
        :return: None
        Note: this method annotates variants with splice tools (SPiP / SpliceAI
        run through a nextflow pipeline inside a Docker container), exporting
        the current variants to a temporary VCF, running the pipeline, and
        merging the annotated output back in.
        """

        # DEBUG
        log.debug("Start annotation with splice tools")

        # Threads: fall back to the instance-level threads setting
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Keep tmp files/folders when verbosity is 'debug'
        # NOTE(review): delete_tmp is computed but never referenced later in
        # this method (cleanup is unconditional) — confirm intended.
        delete_tmp = True
        if self.get_config().get("verbosity", "warning") in ["debug"]:
            delete_tmp = False
        log.debug("Delete tmp files/folders: " + str(delete_tmp))

        # Config: splice tool section, falling back to built-in defaults
        config = self.get_config()
        log.debug("Config: " + str(config))
        splice_config = config.get("tools", {}).get("splice", {})
        if not splice_config:
            splice_config = DEFAULT_TOOLS_BIN.get("splice", {})
        if not splice_config:
            msg_err = "No Splice tool config"
            log.error(msg_err)
            raise ValueError(msg_err)
        log.debug(f"splice_config={splice_config}")

        # Config - Folders - Databases
        databases_folders = (
            config.get("folders", {}).get("databases", {}).get("splice", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # Splice docker image
        splice_docker_image = splice_config.get("docker").get("image")

        # Pull splice image if it's not already there
        if not check_docker_image_exists(splice_docker_image):
            log.warning(
                f"Annotation: splice docker image {splice_docker_image} not found locally, trying to pull from dockerhub"
            )
            try:
                command(f"docker pull {splice_config.get('docker').get('image')}")
            except subprocess.CalledProcessError:
                msg_err = f"Unable to find docker {splice_docker_image} on dockerhub"
                log.error(msg_err)
                raise ValueError(msg_err)
                # NOTE(review): unreachable after raise — confirm intended
                return None

        # Config - splice databases
        splice_databases = (
            config.get("folders", {})
            .get("databases", {})
            .get("splice", DEFAULT_SPLICE_FOLDER)
        )
        splice_databases = full_path(splice_databases)

        # Param
        param = self.get_param()
        log.debug("Param: " + str(param))

        # Param - splice annotation options
        options = param.get("annotation", {}).get("splice", {})
        log.debug("Options: " + str(options))

        # Data
        table_variants = self.get_table_variants()

        # Check if not empty
        log.debug("Check if not empty")
        sql_query_chromosomes = (
            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
        )
        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
            log.info("VCF empty")
            return None

        # Export in VCF
        log.debug("Create initial file to annotate")

        # Create output folder
        output_folder = os.path.join(self.get_tmp_dir(), f"splice-{get_random()}")
        if not os.path.exists(output_folder):
            Path(output_folder).mkdir(parents=True, exist_ok=True)

        # Create tmp VCF file (delete=False: the pipeline reads it by path later)
        tmp_vcf = NamedTemporaryFile(
            prefix=self.get_prefix(),
            dir=output_folder,
            suffix=".vcf",
            delete=False,
        )
        tmp_vcf_name = tmp_vcf.name

        # VCF header
        header = self.get_header()

        # Existing annotations (debug trace only)
        for vcf_annotation in self.get_header().infos:

            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
            log.debug(
                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
            )

        # Memory limit
        # NOTE(review): memory_limit is computed but not referenced later in
        # this method — confirm intended.
        if config.get("memory", None):
            memory_limit = config.get("memory", "8G").upper()
            # upper()
        else:
            memory_limit = "8G"
        log.debug(f"memory_limit: {memory_limit}")

        # Check number of variants to annotate: skip variants already carrying
        # SpliceAI_* or SPiP_* INFO fields
        where_clause_regex_spliceai = r"SpliceAI_\w+"
        where_clause_regex_spip = r"SPiP_\w+"
        where_clause = f""" WHERE NOT regexp_matches("INFO", '{where_clause_regex_spliceai}') AND NOT regexp_matches("INFO", '{where_clause_regex_spip}')"""
        # NOTE(review): query uses the literal table name 'variants' rather
        # than table_variants — confirm they always match.
        df_list_of_variants_to_annotate = self.get_query_to_df(
            query=f""" SELECT * FROM variants {where_clause} """
        )
        if len(df_list_of_variants_to_annotate) == 0:
            log.warning(
                f"No variants to annotate with splice. Variants probably already annotated with splice"
            )
            return None
        else:
            log.info(f"Annotation: {len(df_list_of_variants_to_annotate)} variants")

        # Export VCF file (INFO stripped, samples kept) for the pipeline input
        self.export_variant_vcf(
            vcf_file=tmp_vcf_name,
            remove_info=True,
            add_samples=True,
            index=False,
            where_clause=where_clause,
        )

        # Create docker container and launch splice analysis
        if splice_config:

            # Splice mount folders (path -> docker mount mode)
            mount_folders = splice_config.get("mount", {})

            # Genome mount
            mount_folders[
                config.get("folders", {})
                .get("databases", {})
                .get("genomes", DEFAULT_GENOME_FOLDER)
            ] = "ro"

            # SpliceAI mount
            mount_folders[
                config.get("folders", {})
                .get("databases", {})
                .get("spliceai", DEFAULT_SPLICEAI_FOLDER)
            ] = "ro"

            # SPiP mount
            mount_folders[
                config.get("folders", {})
                .get("databases", {})
                .get("spip", DEFAULT_SPIP_FOLDER)
            ] = "ro"

            # Mount folders
            mount = []

            # Config mount: docker -v options, one per folder
            mount = [
                f"-v {full_path(path)}:{full_path(path)}:{mode}"
                for path, mode in mount_folders.items()
            ]

            if any(value for value in splice_config.values() if value is None):
                log.warning("At least one splice config parameter is empty")
                return None

            # Params in splice nf
            def check_values(dico: dict):
                """
                Yield '--key value' nextflow parameters for the splice pipeline.
                The 'genome' key is normalized to hg19/hg38; note that if no
                assembly alias matches, nothing is yielded for 'genome'.
                """
                for key, val in dico.items():
                    if key == "genome":
                        if any(
                            assemb in options.get("genome", {})
                            for assemb in ["hg19", "GRCh37", "grch37", "GRCH37"]
                        ):
                            yield f"--{key} hg19"
                        elif any(
                            assemb in options.get("genome", {})
                            for assemb in ["hg38", "GRCh38", "grch38", "GRCH38"]
                        ):
                            yield f"--{key} hg38"
                    elif (
                        (isinstance(val, str) and val)
                        or isinstance(val, int)
                        or isinstance(val, bool)
                    ):
                        yield f"--{key} {val}"

            # Genome
            genome = options.get("genome", config.get("assembly", DEFAULT_ASSEMBLY))
            options["genome"] = genome

            # NF params
            nf_params = []

            # Add options
            if options:
                nf_params = list(check_values(options))
                log.debug(f"Splice NF params: {' '.join(nf_params)}")
            else:
                log.debug("No NF params provided")

            # Add threads
            if "threads" not in options.keys():
                nf_params.append(f"--threads {threads}")

            # Genome path
            genome_path = find_genome(
                config.get("folders", {})
                .get("databases", {})
                .get("genomes", DEFAULT_GENOME_FOLDER),
                file=f"{genome}.fa",
            )
            # Add genome path
            if not genome_path:
                raise ValueError(
                    f"Can't find genome assembly {genome}.fa in {config.get('folders', {}).get('databases', {}).get('genomes', DEFAULT_GENOME_FOLDER)}"
                )
            else:
                log.debug(f"Genome: {genome_path}")
                nf_params.append(f"--genome_path {genome_path}")

            def splice_annotations(options: dict = {}, config: dict = {}) -> list:
                """
                Setting up updated databases for SPiP and SpliceAI.

                Returns '--spip_transcriptome'/'--spliceai_annotations'
                parameters when both database files are found, otherwise []
                (the pipeline then uses the annotation files shipped in the
                docker image).
                """

                try:

                    # SpliceAI assembly transcriptome
                    spliceai_assembly = os.path.join(
                        config.get("folders", {})
                        .get("databases", {})
                        .get("spliceai", {}),
                        options.get("genome"),
                        "transcriptome",
                    )
                    spip_assembly = options.get("genome")

                    spip = find(
                        f"transcriptome_{spip_assembly}.RData",
                        config.get("folders", {}).get("databases", {}).get("spip", {}),
                    )
                    spliceai = find("spliceai.refseq.txt", spliceai_assembly)
                    log.debug(f"SPiP annotations: {spip}")
                    log.debug(f"SpliceAI annotations: {spliceai}")
                    if spip and spliceai:
                        return [
                            f"--spip_transcriptome {spip}",
                            f"--spliceai_annotations {spliceai}",
                        ]
                    else:
                        # TODO crash and go on with basic annotations ?
                        # raise ValueError(
                        #     "Can't find splice databases in configuration EXIT"
                        # )
                        log.warning(
                            "Can't find splice databases in configuration, use annotations file from image"
                        )
                except TypeError:
                    log.warning(
                        "Can't find splice databases in configuration, use annotations file from image"
                    )
                return []

            # Add options, check if transcriptome option has already been provided
            # NOTE(review): nf_params holds full strings like "--genome hg19",
            # so these whole-element membership tests are effectively always
            # True — confirm intent.
            if (
                "spip_transcriptome" not in nf_params
                and "spliceai_transcriptome" not in nf_params
            ):
                splice_reference = splice_annotations(options, config)
                if splice_reference:
                    nf_params.extend(splice_reference)

            nf_params.append(f"--output_folder {output_folder}")

            # Nextflow command executed inside the splice docker container
            random_uuid = f"HOWARD-SPLICE-{get_random()}"
            cmd = f"nextflow -log {os.path.join(output_folder, f'{random_uuid}.log')} -c /app/SpliceToolBox/src/splicetoolbox/nextflow/nextflow.docker.config run /app/SpliceToolBox/src/splicetoolbox/nextflow/main.nf -entry SPLICE --vcf {tmp_vcf_name} {' '.join(nf_params)} -profile standard,conda,singularity,report,timeline"
            log.debug(cmd)

            splice_config["docker"]["command"] = cmd

            docker_cmd = get_bin_command(
                tool="splice",
                bin_type="docker",
                config=config,
                default_folder=f"{DEFAULT_TOOLS_FOLDER}/docker",
                add_options=f"--name {random_uuid} {' '.join(mount)}",
            )

            # Docker debug
            # if splice_config.get("rm_container"):
            #     rm_container = "--rm"
            # else:
            #     rm_container = ""
            # docker_cmd = f"docker run {rm_container} --entrypoint '/bin/bash' --name {random_uuid} {' '.join(mount)} {':'.join(splice_config.get('image'))} {cmd}"

            log.debug(docker_cmd)
            res = subprocess.run(docker_cmd, shell=True, capture_output=True, text=True)
            log.debug(res.stdout)
            if res.stderr:
                log.error(res.stderr)
            # Raise CalledProcessError if the container exited non-zero
            res.check_returncode()
        else:
            log.warning(f"Splice tool configuration not found: {config}")

6371 # Update variants 6372 log.info("Annotation - Updating...") 6373 # Test find output vcf 6374 log.debug( 6375 f"TMP splice output: {os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz" 6376 ) 6377 output_vcf = [] 6378 # Wrong folder to look in 6379 for files in os.listdir(os.path.dirname(tmp_vcf_name)): 6380 if ( 6381 files 6382 == f"{os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz" 6383 ): 6384 output_vcf.append(os.path.join(os.path.dirname(tmp_vcf_name), files)) 6385 # log.debug(os.listdir(options.get("output_folder"))) 6386 log.debug(f"Splice annotated vcf: {output_vcf[0]}") 6387 if not output_vcf: 6388 log.debug( 6389 f"Splice output was not generated {os.path.basename(tmp_vcf_name)}*.spip.spliceai.sorted.vcf.gz" 6390 ) 6391 else: 6392 # Get new header from annotated vcf 6393 log.debug(f"Initial header: {len(header.infos)} fields") 6394 # Create new header with splice infos 6395 new_vcf = Variants(input=output_vcf[0]) 6396 new_vcf_header = new_vcf.get_header().infos 6397 for keys, infos in new_vcf_header.items(): 6398 if keys not in header.infos.keys(): 6399 header.infos[keys] = infos 6400 log.debug(f"New header: {len(header.infos)} fields") 6401 log.debug(f"Splice tmp output: {output_vcf[0]}") 6402 self.update_from_vcf(output_vcf[0]) 6403 6404 # Remove folder 6405 remove_if_exists(output_folder) 6406 6407 ### 6408 # Prioritization 6409 ### 6410 6411 def get_config_default(self, name: str) -> dict: 6412 """ 6413 The function `get_config_default` returns a dictionary containing default configurations for 6414 various calculations and prioritizations. 6415 6416 :param name: The `get_config_default` function returns a dictionary containing default 6417 configurations for different calculations and prioritizations. 
        The `name` parameter is used to
        specify which specific configuration to retrieve from the dictionary
        :type name: str
        :return: The function `get_config_default` returns a dictionary containing default configuration
        settings for different calculations and prioritizations. The specific configuration settings are
        retrieved based on the input `name` parameter provided to the function. If the `name` parameter
        matches a key in the `config_default` dictionary, the corresponding configuration settings are
        returned. If there is no match, None is returned.
        """

        config_default = {
            # "calculations": built-in operations, either SQL expressions
            # ("type": "sql", applied via operation_query) or Python callables
            # ("type": "python", resolved via function_name/function_params)
            "calculations": {
                "variant_chr_pos_alt_ref": {
                    "type": "sql",
                    "name": "variant_chr_pos_alt_ref",
                    "description": "Create a variant ID with chromosome, position, alt and ref",
                    "available": False,
                    "output_column_name": "variant_chr_pos_alt_ref",
                    "output_column_type": "String",
                    "output_column_description": "variant ID with chromosome, position, alt and ref",
                    "operation_query": """ concat("#CHROM", '_', "POS", '_', "REF", '_', "ALT") """,
                    "operation_info": True,
                },
                "VARTYPE": {
                    "type": "sql",
                    "name": "VARTYPE",
                    "description": "Variant type (e.g. SNV, INDEL, MNV, BND...)",
                    "available": True,
                    "output_column_name": "VARTYPE",
                    "output_column_type": "String",
                    "output_column_description": "Variant type: SNV if X>Y, MOSAIC if X>Y,Z or X,Y>Z, INDEL if XY>Z or X>YZ",
                    "operation_query": """
                        CASE
                            WHEN "SVTYPE" NOT NULL THEN "SVTYPE"
                            WHEN LENGTH(REF) = 1 AND LENGTH(ALT) = 1 THEN 'SNV'
                            WHEN REF LIKE '%,%' OR ALT LIKE '%,%' THEN 'MOSAIC'
                            WHEN LENGTH(REF) == LENGTH(ALT) AND LENGTH(REF) > 1 THEN 'MNV'
                            WHEN LENGTH(REF) <> LENGTH(ALT) THEN 'INDEL'
                            ELSE 'UNDEFINED'
                        END
                    """,
                    "info_fields": ["SVTYPE"],
                    "operation_info": True,
                },
                "snpeff_hgvs": {
                    "type": "python",
                    "name": "snpeff_hgvs",
                    "description": "HGVS nomenclatures from snpEff annotation",
                    "available": True,
                    "function_name": "calculation_extract_snpeff_hgvs",
                    "function_params": ["snpeff_hgvs", "ANN"],
                },
                # NOTE(review): the descriptions of snpeff_ann_explode and
                # snpeff_ann_explode_uniquify appear swapped (the non-uniquify
                # entry mentions "uniquify values") — confirm before changing
                # these runtime strings.
                "snpeff_ann_explode": {
                    "type": "python",
                    "name": "snpeff_ann_explode",
                    "description": "Explode snpEff annotations with uniquify values",
                    "available": True,
                    "function_name": "calculation_snpeff_ann_explode",
                    "function_params": [False, "fields", "snpeff_", "ANN"],
                },
                "snpeff_ann_explode_uniquify": {
                    "type": "python",
                    "name": "snpeff_ann_explode_uniquify",
                    "description": "Explode snpEff annotations",
                    "available": True,
                    "function_name": "calculation_snpeff_ann_explode",
                    "function_params": [True, "fields", "snpeff_uniquify_", "ANN"],
                },
                "snpeff_ann_explode_json": {
                    "type": "python",
                    "name": "snpeff_ann_explode_json",
                    "description": "Explode snpEff annotations in JSON format",
                    "available": True,
                    "function_name": "calculation_snpeff_ann_explode",
                    "function_params": [False, "JSON", "snpeff_json", "ANN"],
                },
                "NOMEN": {
                    "type": "python",
                    "name": "NOMEN",
                    "description": "NOMEN information (e.g. NOMEN, CNOMEN, PNOMEN...) from HGVS nomenclature field",
                    "available": True,
                    "function_name": "calculation_extract_nomen",
                    "function_params": [],
                },
                "FINDBYPIPELINE": {
                    "type": "python",
                    "name": "FINDBYPIPELINE",
                    "description": "Number of pipeline that identify the variant (for multi pipeline VCF)",
                    "available": True,
                    "function_name": "calculation_find_by_pipeline",
                    "function_params": ["findbypipeline"],
                },
                "FINDBYSAMPLE": {
                    "type": "python",
                    "name": "FINDBYSAMPLE",
                    "description": "Number of sample that have a genotype for the variant (for multi sample VCF)",
                    "available": True,
                    "function_name": "calculation_find_by_pipeline",
                    "function_params": ["findbysample"],
                },
                "GENOTYPECONCORDANCE": {
                    "type": "python",
                    "name": "GENOTYPECONCORDANCE",
                    "description": "Concordance of genotype for multi caller VCF",
                    "available": True,
                    "function_name": "calculation_genotype_concordance",
                    "function_params": [],
                },
                "BARCODE": {
                    "type": "python",
                    "name": "BARCODE",
                    "description": "BARCODE as VaRank tool",
                    "available": True,
                    "function_name": "calculation_barcode",
                    "function_params": [],
                },
                "BARCODEFAMILY": {
                    "type": "python",
                    "name": "BARCODEFAMILY",
                    "description": "BARCODEFAMILY as VaRank tool",
                    "available": True,
                    "function_name": "calculation_barcode_family",
                    "function_params": ["BCF"],
                },
                "TRIO": {
                    "type": "python",
                    "name": "TRIO",
                    "description": "Inheritance for a trio family",
                    "available": True,
                    "function_name": "calculation_trio",
                    "function_params": [],
                },
                "VAF": {
                    "type": "python",
                    "name": "VAF",
                    "description": "Variant Allele Frequency (VAF) harmonization",
                    "available": True,
                    "function_name": "calculation_vaf_normalization",
                    "function_params": [],
                },
                "VAF_stats": {
                    "type": "python",
                    "name": "VAF_stats",
                    "description": "Variant Allele Frequency (VAF) statistics",
                    "available": True,
                    "function_name": "calculation_genotype_stats",
                    "function_params": ["VAF"],
                },
                "DP_stats": {
                    "type": "python",
                    "name": "DP_stats",
                    "description": "Depth (DP) statistics",
                    "available": True,
                    "function_name": "calculation_genotype_stats",
                    "function_params": ["DP"],
                },
                "variant_id": {
                    "type": "python",
                    "name": "variant_id",
                    "description": "Variant ID generated from variant position and type",
                    "available": True,
                    "function_name": "calculation_variant_id",
                    "function_params": [],
                },
                "transcripts_json": {
                    "type": "python",
                    "name": "transcripts_json",
                    "description": "Add transcripts annotations in JSON format (field 'transcripts_json')",
                    "available": True,
                    "function_name": "calculation_transcripts_annotation",
                    "function_params": ["transcripts_json", None],
                },
                "transcripts_ann": {
                    "type": "python",
                    "name": "transcripts_ann",
                    "description": "Add transcripts annotations in structured format (field 'transcripts_ann')",
                    "available": True,
                    "function_name": "calculation_transcripts_annotation",
                    "function_params": [None, "transcripts_ann"],
                },
                "transcripts_annotations": {
                    "type": "python",
                    "name": "transcripts_annotations",
                    "description": "Add transcripts annotations in JSON and/or structured format (see param JSON file)",
                    "available": True,
                    "function_name": "calculation_transcripts_annotation",
                    "function_params": [None, None],
                },
                "transcripts_prioritization": {
                    "type": "python",
                    "name": "transcripts_prioritization",
                    "description": "Prioritize transcripts with a prioritization profile (using param.json)",
                    "available": True,
                    "function_name": "calculation_transcripts_prioritization",
                    "function_params": [],
                },
            },
            # "prioritizations": default prioritization profile; each INFO
            # field maps to a list of criteria (type/value/score/flag/comment)
            "prioritizations": {
                "default": {
                    "filter": [
                        {
                            "type": "notequals",
                            "value": "!PASS|\\.",
                            "score": 0,
                            "flag": "FILTERED",
                            "comment": ["Bad variant quality"],
                        },
                        {
                            "type": "equals",
                            "value": "REJECT",
                            "score": -20,
                            "flag": "PASS",
                            "comment": ["Bad variant quality"],
                        },
                    ],
                    "DP": [
                        {
                            "type": "gte",
                            "value": "50",
                            "score": 5,
                            "flag": "PASS",
                            "comment": ["DP higher than 50"],
                        }
                    ],
                    "ANN": [
                        {
                            "type": "contains",
                            "value": "HIGH",
                            "score": 5,
                            "flag": "PASS",
                            "comment": [
                                "The variant is assumed to have high (disruptive) impact in the protein, probably causing protein truncation, loss of function or triggering nonsense mediated decay"
                            ],
                        },
                        {
                            "type": "contains",
                            "value": "MODERATE",
                            "score": 3,
                            "flag": "PASS",
                            "comment": [
                                "A non-disruptive variant that might change protein effectiveness"
                            ],
                        },
                        {
                            "type": "contains",
                            "value": "LOW",
                            "score": 0,
                            "flag": "FILTERED",
                            "comment": [
                                "Assumed to be mostly harmless or unlikely to change protein behavior"
                            ],
                        },
                        {
                            "type": "contains",
                            "value": "MODIFIER",
                            "score": 0,
                            "flag": "FILTERED",
                            "comment": [
                                "Usually non-coding variants or variants affecting non-coding genes, where predictions are difficult or there is no evidence of impact"
                            ],
                        },
                    ],
                }
            },
        }

        return config_default.get(name, None)

    def get_config_json(
        self, name: str, config_dict: dict = {}, config_file: str = None
    ) -> dict:
        """
        The function `get_config_json` retrieves a configuration JSON object with prioritizations from
        default values, a dictionary, and a file.

        NOTE: `config_dict` uses a mutable default ({}); it is only read here,
        but avoid mutating it.

        :param name: The `name` parameter in the `get_config_json` function is a string that represents
        the name of the configuration. It is used to identify and retrieve the configuration settings
        for a specific component or module
        :type name: str
        :param config_dict: The `config_dict` parameter in the `get_config_json` function is a
        dictionary that allows you to provide additional configuration settings or overrides. When you
        call the `get_config_json` function, you can pass a dictionary containing key-value pairs where
        the key is the configuration setting you want to override or
        :type config_dict: dict
        :param config_file: The `config_file` parameter in the `get_config_json` function is used to
        specify the path to a configuration file that contains additional settings. If provided, the
        function will read the contents of this file and update the configuration dictionary with the
        values found in the file, overriding any existing values with the
        :type config_file: str
        :return: The function `get_config_json` returns a dictionary containing the configuration
        settings.
        """

        # Start from the built-in defaults for this configuration name
        config_default = self.get_config_default(name=name)
        # NOTE(review): `configuration` aliases the defaults dict (no copy) —
        # the overrides below mutate the object returned by get_config_default.
        configuration = config_default
        # log.debug(f"configuration={configuration}")

        # Override top-level keys from the provided dict
        for config in config_dict:
            configuration[config] = config_dict[config]

        # Override top-level keys from a JSON config file, if provided
        config_file = full_path(config_file)
        if config_file:
            if os.path.exists(config_file):
                with open(config_file) as config_file_content:
                    config_file_dict = json.load(config_file_content)
                for config in config_file_dict:
                    configuration[config] = config_file_dict[config]
            else:
                msg_error = f"Config '{name}' file '{config_file}' does NOT exist"
                log.error(msg_error)
                raise ValueError(msg_error)

        return configuration

    def prioritization(
        self, table: str = None, pz_prefix: str = None, pz_param: dict = None
    ) -> bool:
        """
        The `prioritization` function in Python processes VCF files, adds new INFO fields, and
        prioritizes variants based on configured profiles and criteria.

        :param table: The `table` parameter in the `prioritization` function is used to specify the name
        of the table (presumably a VCF file) on which the prioritization operation will be performed. If
        a table name is provided, the method will prioritize the variants in that specific table
        :type table: str
        :param pz_prefix: The `pz_prefix` parameter is used to specify a prefix that will be added to
        certain INFO fields in a VCF file during the prioritization process. If this parameter is not
        provided, the code will use a default prefix value of "PZ"
        :type pz_prefix: str
        :param pz_param: The `pz_param` parameter in the `prioritization` method is used to pass
These parameters can include 6751 settings related to prioritization profiles, fields, scoring modes, flags, comments, and other 6752 configurations needed for the prioritization of variants in a V 6753 :type pz_param: dict 6754 :return: A boolean value (True) is being returned from the `prioritization` function. 6755 """ 6756 6757 # Config 6758 config = self.get_config() 6759 6760 # Param 6761 param = self.get_param() 6762 6763 # Prioritization param 6764 if pz_param is not None: 6765 prioritization_param = pz_param 6766 else: 6767 prioritization_param = param.get("prioritization", {}) 6768 6769 # Configuration profiles 6770 prioritization_config_file = prioritization_param.get( 6771 "prioritization_config", None 6772 ) 6773 prioritization_config_file = full_path(prioritization_config_file) 6774 prioritizations_config = self.get_config_json( 6775 name="prioritizations", config_file=prioritization_config_file 6776 ) 6777 6778 # Prioritization prefix 6779 pz_prefix_default = "PZ" 6780 if pz_prefix is None: 6781 pz_prefix = prioritization_param.get("pzprefix", pz_prefix_default) 6782 6783 # Prioritization options 6784 profiles = prioritization_param.get("profiles", []) 6785 if isinstance(profiles, str): 6786 profiles = profiles.split(",") 6787 pzfields = prioritization_param.get( 6788 "pzfields", [f"{pz_prefix}Flag", f"{pz_prefix}Score"] 6789 ) 6790 if isinstance(pzfields, str): 6791 pzfields = pzfields.split(",") 6792 default_profile = prioritization_param.get("default_profile", None) 6793 pzfields_sep = prioritization_param.get("pzfields_sep", "_") 6794 prioritization_score_mode = prioritization_param.get( 6795 "prioritization_score_mode", "HOWARD" 6796 ) 6797 6798 # Quick Prioritizations 6799 prioritizations = param.get("prioritizations", None) 6800 if prioritizations: 6801 log.info("Quick Prioritization:") 6802 for profile in prioritizations.split(","): 6803 if profile not in profiles: 6804 profiles.append(profile) 6805 log.info(f" {profile}") 6806 6807 # If 
profile "ALL" provided, all profiles in the config profiles 6808 if "ALL" in profiles: 6809 profiles = list(prioritizations_config.keys()) 6810 6811 for profile in profiles: 6812 if prioritizations_config.get(profile, None): 6813 log.debug(f"Profile '{profile}' configured") 6814 else: 6815 msg_error = f"Profile '{profile}' NOT configured" 6816 log.error(msg_error) 6817 raise ValueError(msg_error) 6818 6819 if profiles: 6820 log.info(f"Prioritization... ") 6821 else: 6822 log.debug(f"No profile defined") 6823 return False 6824 6825 if not default_profile and len(profiles): 6826 default_profile = profiles[0] 6827 6828 log.debug("Profiles availables: " + str(list(prioritizations_config.keys()))) 6829 log.debug("Profiles to check: " + str(list(profiles))) 6830 6831 # Variables 6832 if table is not None: 6833 table_variants = table 6834 else: 6835 table_variants = self.get_table_variants(clause="update") 6836 log.debug(f"Table to prioritize: {table_variants}") 6837 6838 # Added columns 6839 added_columns = [] 6840 6841 # Create list of PZfields 6842 # List of PZFields 6843 list_of_pzfields_original = pzfields + [ 6844 pzfield + pzfields_sep + profile 6845 for pzfield in pzfields 6846 for profile in profiles 6847 ] 6848 list_of_pzfields = [] 6849 log.debug(f"{list_of_pzfields_original}") 6850 6851 # Remove existing PZfields to use if exists 6852 for pzfield in list_of_pzfields_original: 6853 if self.get_header().infos.get(pzfield, None) is None: 6854 list_of_pzfields.append(pzfield) 6855 log.debug(f"VCF Input - Header - PZfield '{pzfield}' not in VCF") 6856 else: 6857 log.debug(f"VCF Input - Header - PZfield '{pzfield}' already in VCF") 6858 6859 if list_of_pzfields: 6860 6861 # Explode Infos prefix 6862 explode_infos_prefix = self.get_explode_infos_prefix() 6863 6864 # PZfields tags description 6865 PZfields_INFOS = { 6866 f"{pz_prefix}Tags": { 6867 "ID": f"{pz_prefix}Tags", 6868 "Number": ".", 6869 "Type": "String", 6870 "Description": "Variant tags based on annotation 
criteria", 6871 }, 6872 f"{pz_prefix}Score": { 6873 "ID": f"{pz_prefix}Score", 6874 "Number": 1, 6875 "Type": "Integer", 6876 "Description": "Variant score based on annotation criteria", 6877 }, 6878 f"{pz_prefix}Flag": { 6879 "ID": f"{pz_prefix}Flag", 6880 "Number": 1, 6881 "Type": "String", 6882 "Description": "Variant flag based on annotation criteria", 6883 }, 6884 f"{pz_prefix}Comment": { 6885 "ID": f"{pz_prefix}Comment", 6886 "Number": ".", 6887 "Type": "String", 6888 "Description": "Variant comment based on annotation criteria", 6889 }, 6890 f"{pz_prefix}Infos": { 6891 "ID": f"{pz_prefix}Infos", 6892 "Number": ".", 6893 "Type": "String", 6894 "Description": "Variant infos based on annotation criteria", 6895 }, 6896 } 6897 6898 # Create INFO fields if not exist 6899 for field in PZfields_INFOS: 6900 field_ID = PZfields_INFOS[field]["ID"] 6901 field_description = PZfields_INFOS[field]["Description"] 6902 if field_ID not in self.get_header().infos and field_ID in pzfields: 6903 field_description = ( 6904 PZfields_INFOS[field]["Description"] 6905 + f", profile {default_profile}" 6906 ) 6907 self.get_header().infos[field_ID] = vcf.parser._Info( 6908 field_ID, 6909 PZfields_INFOS[field]["Number"], 6910 PZfields_INFOS[field]["Type"], 6911 field_description, 6912 "unknown", 6913 "unknown", 6914 code_type_map[PZfields_INFOS[field]["Type"]], 6915 ) 6916 6917 # Create INFO fields if not exist for each profile 6918 for profile in prioritizations_config: 6919 if profile in profiles or profiles == []: 6920 for field in PZfields_INFOS: 6921 field_ID = PZfields_INFOS[field]["ID"] + pzfields_sep + profile 6922 field_description = ( 6923 PZfields_INFOS[field]["Description"] 6924 + f", profile {profile}" 6925 ) 6926 if ( 6927 field_ID not in self.get_header().infos 6928 and field in pzfields 6929 ): 6930 self.get_header().infos[field_ID] = vcf.parser._Info( 6931 field_ID, 6932 PZfields_INFOS[field]["Number"], 6933 PZfields_INFOS[field]["Type"], 6934 field_description, 6935 
"unknown", 6936 "unknown", 6937 code_type_map[PZfields_INFOS[field]["Type"]], 6938 ) 6939 6940 # Header 6941 for pzfield in list_of_pzfields: 6942 if re.match(f"{pz_prefix}Score.*", pzfield): 6943 added_column = self.add_column( 6944 table_name=table_variants, 6945 column_name=pzfield, 6946 column_type="INTEGER", 6947 default_value="0", 6948 ) 6949 elif re.match(f"{pz_prefix}Flag.*", pzfield): 6950 added_column = self.add_column( 6951 table_name=table_variants, 6952 column_name=pzfield, 6953 column_type="BOOLEAN", 6954 default_value="1", 6955 ) 6956 else: 6957 added_column = self.add_column( 6958 table_name=table_variants, 6959 column_name=pzfield, 6960 column_type="STRING", 6961 default_value="''", 6962 ) 6963 added_columns.append(added_column) 6964 6965 # Profiles 6966 if profiles: 6967 6968 # foreach profile in configuration file 6969 for profile in prioritizations_config: 6970 6971 # If profile is asked in param, or ALL are asked (empty profile []) 6972 if profile in profiles or profiles == []: 6973 log.info(f"Profile '{profile}'") 6974 6975 sql_set_info_option = "" 6976 6977 sql_set_info = [] 6978 6979 # PZ fields set 6980 6981 # PZScore 6982 if ( 6983 f"{pz_prefix}Score{pzfields_sep}{profile}" 6984 in list_of_pzfields 6985 ): 6986 sql_set_info.append( 6987 f""" 6988 concat( 6989 '{pz_prefix}Score{pzfields_sep}{profile}=', 6990 {pz_prefix}Score{pzfields_sep}{profile} 6991 ) 6992 """ 6993 ) 6994 if ( 6995 profile == default_profile 6996 and f"{pz_prefix}Score" in list_of_pzfields 6997 ): 6998 sql_set_info.append( 6999 f""" 7000 concat( 7001 '{pz_prefix}Score=', 7002 {pz_prefix}Score{pzfields_sep}{profile} 7003 ) 7004 """ 7005 ) 7006 7007 # PZFlag 7008 if ( 7009 f"{pz_prefix}Flag{pzfields_sep}{profile}" 7010 in list_of_pzfields 7011 ): 7012 sql_set_info.append( 7013 f""" 7014 concat( 7015 '{pz_prefix}Flag{pzfields_sep}{profile}=', 7016 CASE 7017 WHEN {pz_prefix}Flag{pzfields_sep}{profile}==1 7018 THEN 'PASS' 7019 WHEN {pz_prefix}Flag{pzfields_sep}{profile}==0 
7020 THEN 'FILTERED' 7021 END 7022 ) 7023 """ 7024 ) 7025 if ( 7026 profile == default_profile 7027 and f"{pz_prefix}Flag" in list_of_pzfields 7028 ): 7029 sql_set_info.append( 7030 f""" 7031 concat( 7032 '{pz_prefix}Flag=', 7033 CASE 7034 WHEN {pz_prefix}Flag{pzfields_sep}{profile}==1 7035 THEN 'PASS' 7036 WHEN {pz_prefix}Flag{pzfields_sep}{profile}==0 7037 THEN 'FILTERED' 7038 END 7039 ) 7040 """ 7041 ) 7042 7043 # PZComment 7044 if ( 7045 f"{pz_prefix}Comment{pzfields_sep}{profile}" 7046 in list_of_pzfields 7047 ): 7048 sql_set_info.append( 7049 f""" 7050 CASE 7051 WHEN {pz_prefix}Comment{pzfields_sep}{profile} NOT IN ('') 7052 THEN concat('{pz_prefix}Comment{pzfields_sep}{profile}=', {pz_prefix}Comment{pzfields_sep}{profile}) 7053 ELSE '' 7054 END 7055 """ 7056 ) 7057 if ( 7058 profile == default_profile 7059 and f"{pz_prefix}Comment" in list_of_pzfields 7060 ): 7061 sql_set_info.append( 7062 f""" 7063 CASE 7064 WHEN {pz_prefix}Comment{pzfields_sep}{profile} NOT IN ('') 7065 THEN concat('{pz_prefix}Comment=', {pz_prefix}Comment{pzfields_sep}{profile}) 7066 ELSE '' 7067 END 7068 """ 7069 ) 7070 7071 # PZInfos 7072 if ( 7073 f"{pz_prefix}Infos{pzfields_sep}{profile}" 7074 in list_of_pzfields 7075 ): 7076 sql_set_info.append( 7077 f""" 7078 CASE 7079 WHEN {pz_prefix}Infos{pzfields_sep}{profile} NOT IN ('') 7080 THEN concat('{pz_prefix}Infos{pzfields_sep}{profile}=', {pz_prefix}Infos{pzfields_sep}{profile}) 7081 ELSE '' 7082 END 7083 """ 7084 ) 7085 if ( 7086 profile == default_profile 7087 and f"{pz_prefix}Infos" in list_of_pzfields 7088 ): 7089 sql_set_info.append( 7090 f""" 7091 CASE 7092 WHEN {pz_prefix}Infos{pzfields_sep}{profile} NOT IN ('') 7093 THEN concat('{pz_prefix}Infos=', {pz_prefix}Infos{pzfields_sep}{profile}) 7094 ELSE '' 7095 END 7096 """ 7097 ) 7098 7099 # Merge PZfields 7100 sql_set_info_option = "" 7101 sql_set_sep = "" 7102 for sql_set in sql_set_info: 7103 if sql_set_sep: 7104 sql_set_info_option += f""" 7105 , concat('{sql_set_sep}', 
{sql_set}) 7106 """ 7107 else: 7108 sql_set_info_option += f""" 7109 , {sql_set} 7110 """ 7111 sql_set_sep = ";" 7112 7113 sql_queries = [] 7114 for annotation in prioritizations_config[profile]: 7115 7116 # Explode specific annotation 7117 log.debug(f"Explode annotation '{annotation}'") 7118 added_columns += self.explode_infos( 7119 prefix=explode_infos_prefix, 7120 fields=[annotation], 7121 table=table_variants, 7122 ) 7123 extra_infos = self.get_extra_infos(table=table_variants) 7124 7125 # Check if annotation field is present 7126 if not f"{explode_infos_prefix}{annotation}" in extra_infos: 7127 log.debug(f"Annotation '{annotation}' not in data") 7128 continue 7129 else: 7130 log.debug(f"Annotation '{annotation}' in data") 7131 7132 # For each criterions 7133 for criterion in prioritizations_config[profile][ 7134 annotation 7135 ]: 7136 criterion_type = criterion["type"] 7137 criterion_value = criterion["value"] 7138 criterion_score = criterion.get("score", 0) 7139 criterion_flag = criterion.get("flag", "PASS") 7140 criterion_flag_bool = criterion_flag == "PASS" 7141 criterion_comment = ( 7142 ", ".join(criterion.get("comment", [])) 7143 .replace("'", "''") 7144 .replace(";", ",") 7145 .replace("\t", " ") 7146 ) 7147 criterion_infos = ( 7148 str(criterion) 7149 .replace("'", "''") 7150 .replace(";", ",") 7151 .replace("\t", " ") 7152 ) 7153 7154 sql_set = [] 7155 sql_set_info = [] 7156 7157 # PZ fields set 7158 if ( 7159 f"{pz_prefix}Score{pzfields_sep}{profile}" 7160 in list_of_pzfields 7161 ): 7162 if prioritization_score_mode == "HOWARD": 7163 sql_set.append( 7164 f"{pz_prefix}Score{pzfields_sep}{profile} = {pz_prefix}Score{pzfields_sep}{profile} + {criterion_score}" 7165 ) 7166 elif prioritization_score_mode == "VaRank": 7167 sql_set.append( 7168 f"{pz_prefix}Score{pzfields_sep}{profile} = CASE WHEN {criterion_score}>{pz_prefix}Score{pzfields_sep}{profile} THEN {criterion_score} END" 7169 ) 7170 else: 7171 sql_set.append( 7172 
f"{pz_prefix}Score{pzfields_sep}{profile} = {pz_prefix}Score{pzfields_sep}{profile} + {criterion_score}" 7173 ) 7174 if ( 7175 f"{pz_prefix}Flag{pzfields_sep}{profile}" 7176 in list_of_pzfields 7177 ): 7178 sql_set.append( 7179 f"{pz_prefix}Flag{pzfields_sep}{profile} = {pz_prefix}Flag{pzfields_sep}{profile} AND {criterion_flag_bool}" 7180 ) 7181 if ( 7182 f"{pz_prefix}Comment{pzfields_sep}{profile}" 7183 in list_of_pzfields 7184 ): 7185 sql_set.append( 7186 f""" 7187 {pz_prefix}Comment{pzfields_sep}{profile} = 7188 concat( 7189 {pz_prefix}Comment{pzfields_sep}{profile}, 7190 CASE 7191 WHEN {pz_prefix}Comment{pzfields_sep}{profile}!='' 7192 THEN ', ' 7193 ELSE '' 7194 END, 7195 '{criterion_comment}' 7196 ) 7197 """ 7198 ) 7199 if ( 7200 f"{pz_prefix}Infos{pzfields_sep}{profile}" 7201 in list_of_pzfields 7202 ): 7203 sql_set.append( 7204 f""" 7205 {pz_prefix}Infos{pzfields_sep}{profile} = 7206 concat( 7207 {pz_prefix}Infos{pzfields_sep}{profile}, 7208 '{criterion_infos}' 7209 ) 7210 """ 7211 ) 7212 sql_set_option = ",".join(sql_set) 7213 7214 # Criterion and comparison 7215 if sql_set_option: 7216 try: 7217 float(criterion_value) 7218 sql_update = f""" 7219 UPDATE {table_variants} 7220 SET {sql_set_option} 7221 WHERE CAST("{explode_infos_prefix}{annotation}" AS VARCHAR) NOT IN ('','.') 7222 AND CAST("{explode_infos_prefix}{annotation}" AS FLOAT){comparison_map[criterion_type]}{criterion_value} 7223 """ 7224 except: 7225 contains_option = "" 7226 if criterion_type == "contains": 7227 contains_option = ".*" 7228 sql_update = f""" 7229 UPDATE {table_variants} 7230 SET {sql_set_option} 7231 WHERE "{explode_infos_prefix}{annotation}" SIMILAR TO '{contains_option}{criterion_value}{contains_option}' 7232 """ 7233 sql_queries.append(sql_update) 7234 else: 7235 log.warning( 7236 f"NO SQL SET option for '{annotation}' - '{criterion}'" 7237 ) 7238 7239 # PZTags 7240 if ( 7241 f"{pz_prefix}Tags{pzfields_sep}{profile}" 7242 in list_of_pzfields 7243 ): 7244 7245 # Create PZFalgs 
value 7246 pztags_value = "" 7247 pztags_sep_default = "|" 7248 pztags_sep = "" 7249 for pzfield in pzfields: 7250 if pzfield not in [f"{pz_prefix}Tags"]: 7251 if ( 7252 f"{pzfield}{pzfields_sep}{profile}" 7253 in list_of_pzfields 7254 ): 7255 if pzfield in [f"{pz_prefix}Flag"]: 7256 pztags_value += f"""{pztags_sep}{pzfield}#', 7257 CASE WHEN {pz_prefix}Flag{pzfields_sep}{profile} 7258 THEN 'PASS' 7259 ELSE 'FILTERED' 7260 END, '""" 7261 else: 7262 pztags_value += f"{pztags_sep}{pzfield}#', {pzfield}{pzfields_sep}{profile}, '" 7263 pztags_sep = pztags_sep_default 7264 7265 # Add Query update for PZFlags 7266 sql_update_pztags = f""" 7267 UPDATE {table_variants} 7268 SET INFO = concat( 7269 INFO, 7270 CASE WHEN INFO NOT in ('','.') 7271 THEN ';' 7272 ELSE '' 7273 END, 7274 '{pz_prefix}Tags{pzfields_sep}{profile}={pztags_value}' 7275 ) 7276 """ 7277 sql_queries.append(sql_update_pztags) 7278 7279 # Add Query update for PZFlags for default 7280 if profile == default_profile: 7281 sql_update_pztags_default = f""" 7282 UPDATE {table_variants} 7283 SET INFO = concat( 7284 INFO, 7285 ';', 7286 '{pz_prefix}Tags={pztags_value}' 7287 ) 7288 """ 7289 sql_queries.append(sql_update_pztags_default) 7290 7291 log.info(f"""Profile '{profile}' - Prioritization... """) 7292 7293 if sql_queries: 7294 7295 for sql_query in sql_queries: 7296 log.debug( 7297 f"""Profile '{profile}' - Prioritization query: {sql_query}... """ 7298 ) 7299 self.conn.execute(sql_query) 7300 7301 log.info(f"""Profile '{profile}' - Update... 
""") 7302 sql_query_update = f""" 7303 UPDATE {table_variants} 7304 SET INFO = 7305 concat( 7306 CASE 7307 WHEN INFO NOT IN ('','.') 7308 THEN concat(INFO, ';') 7309 ELSE '' 7310 END 7311 {sql_set_info_option} 7312 ) 7313 """ 7314 self.conn.execute(sql_query_update) 7315 7316 else: 7317 7318 log.warning(f"No profiles in parameters") 7319 7320 # Remove added columns 7321 for added_column in added_columns: 7322 self.drop_column(column=added_column) 7323 7324 # Explode INFOS fields into table fields 7325 if self.get_explode_infos(): 7326 self.explode_infos( 7327 prefix=self.get_explode_infos_prefix(), 7328 fields=self.get_explode_infos_fields(), 7329 force=True, 7330 ) 7331 7332 return True 7333 7334 ### 7335 # HGVS 7336 ### 7337 7338 def annotation_hgvs(self, threads: int = None) -> None: 7339 """ 7340 The `annotation_hgvs` function performs HGVS annotation on a set of variants using genomic 7341 coordinates and alleles. 7342 7343 :param threads: The `threads` parameter is an optional integer that specifies the number of 7344 threads to use for parallel processing. If no value is provided, it will default to the number 7345 of threads obtained from the `get_threads()` method 7346 :type threads: int 7347 """ 7348 7349 # Function for each partition of the Dask Dataframe 7350 def partition_function(partition): 7351 """ 7352 The function `partition_function` applies the `annotation_hgvs_partition` function to 7353 each row of a DataFrame called `partition`. 7354 7355 :param partition: The parameter "partition" is a pandas DataFrame that contains the data 7356 to be processed 7357 :return: the result of applying the "annotation_hgvs_partition" function to each row of 7358 the "partition" dataframe along the axis 1. 
7359 """ 7360 return partition.apply(annotation_hgvs_partition, axis=1) 7361 7362 def annotation_hgvs_partition(row) -> str: 7363 """ 7364 The function `annotation_hgvs_partition` takes in a row of data and returns a string 7365 containing a list of HGVS names associated with the given genomic coordinates and alleles. 7366 7367 :param row: A dictionary-like object that contains the values for the following keys: 7368 :return: a string that contains the HGVS names associated with the given row of data. 7369 """ 7370 7371 chr = row["CHROM"] 7372 pos = row["POS"] 7373 ref = row["REF"] 7374 alt = row["ALT"] 7375 7376 # Find list of associated transcripts 7377 transcripts_list = list( 7378 polars_conn.execute( 7379 f""" 7380 SELECT transcript 7381 FROM refseq_df 7382 WHERE CHROM='{chr}' 7383 AND POS={pos} 7384 """ 7385 )["transcript"] 7386 ) 7387 7388 # Full HGVS annotation in list 7389 hgvs_full_list = [] 7390 7391 for transcript_name in transcripts_list: 7392 7393 # Transcript 7394 transcript = get_transcript( 7395 transcripts=transcripts, transcript_name=transcript_name 7396 ) 7397 # Exon 7398 if use_exon: 7399 exon = transcript.find_exon_number(pos) 7400 else: 7401 exon = None 7402 # Protein 7403 transcript_protein = None 7404 if use_protein or add_protein or full_format: 7405 transcripts_protein = list( 7406 polars_conn.execute( 7407 f""" 7408 SELECT protein 7409 FROM refseqlink_df 7410 WHERE transcript='{transcript_name}' 7411 LIMIT 1 7412 """ 7413 )["protein"] 7414 ) 7415 if len(transcripts_protein): 7416 transcript_protein = transcripts_protein[0] 7417 7418 # HGVS name 7419 hgvs_name = format_hgvs_name( 7420 chr, 7421 pos, 7422 ref, 7423 alt, 7424 genome=genome, 7425 transcript=transcript, 7426 transcript_protein=transcript_protein, 7427 exon=exon, 7428 use_gene=use_gene, 7429 use_protein=use_protein, 7430 full_format=full_format, 7431 use_version=use_version, 7432 codon_type=codon_type, 7433 ) 7434 hgvs_full_list.append(hgvs_name) 7435 if add_protein and not 
use_protein and not full_format: 7436 hgvs_name = format_hgvs_name( 7437 chr, 7438 pos, 7439 ref, 7440 alt, 7441 genome=genome, 7442 transcript=transcript, 7443 transcript_protein=transcript_protein, 7444 exon=exon, 7445 use_gene=use_gene, 7446 use_protein=True, 7447 full_format=False, 7448 use_version=use_version, 7449 codon_type=codon_type, 7450 ) 7451 hgvs_full_list.append(hgvs_name) 7452 7453 # Create liste of HGVS annotations 7454 hgvs_full = ",".join(hgvs_full_list) 7455 7456 return hgvs_full 7457 7458 # Polars connexion 7459 polars_conn = pl.SQLContext(register_globals=True, eager=True) 7460 7461 # Config 7462 config = self.get_config() 7463 7464 # Databases 7465 # Genome 7466 databases_genomes_folders = ( 7467 config.get("folders", {}) 7468 .get("databases", {}) 7469 .get("genomes", DEFAULT_GENOME_FOLDER) 7470 ) 7471 databases_genome = ( 7472 config.get("folders", {}).get("databases", {}).get("genomes", "") 7473 ) 7474 # refseq database folder 7475 databases_refseq_folders = ( 7476 config.get("folders", {}) 7477 .get("databases", {}) 7478 .get("refseq", DEFAULT_REFSEQ_FOLDER) 7479 ) 7480 # refseq 7481 databases_refseq = config.get("databases", {}).get("refSeq", None) 7482 # refSeqLink 7483 databases_refseqlink = config.get("databases", {}).get("refSeqLink", None) 7484 7485 # Param 7486 param = self.get_param() 7487 7488 # Quick HGVS 7489 if "hgvs_options" in param and param.get("hgvs_options", ""): 7490 log.info(f"Quick HGVS Annotation:") 7491 if not param.get("hgvs", None): 7492 param["hgvs"] = {} 7493 for option in param.get("hgvs_options", "").split(","): 7494 option_var_val = option.split("=") 7495 option_var = option_var_val[0] 7496 if len(option_var_val) > 1: 7497 option_val = option_var_val[1] 7498 else: 7499 option_val = "True" 7500 if option_val.upper() in ["TRUE"]: 7501 option_val = True 7502 elif option_val.upper() in ["FALSE"]: 7503 option_val = False 7504 log.info(f" {option_var}={option_val}") 7505 param["hgvs"][option_var] = option_val 7506 
7507 # Check if HGVS annotation enabled 7508 if "hgvs" in param: 7509 log.info(f"HGVS Annotation... ") 7510 for hgvs_option in param.get("hgvs", {}): 7511 log.info(f"{hgvs_option}: {param.get('hgvs',{}).get(hgvs_option)}") 7512 else: 7513 return 7514 7515 # HGVS Param 7516 param_hgvs = param.get("hgvs", {}) 7517 use_exon = param_hgvs.get("use_exon", False) 7518 use_gene = param_hgvs.get("use_gene", False) 7519 use_protein = param_hgvs.get("use_protein", False) 7520 add_protein = param_hgvs.get("add_protein", False) 7521 full_format = param_hgvs.get("full_format", False) 7522 use_version = param_hgvs.get("use_version", False) 7523 codon_type = param_hgvs.get("codon_type", "3") 7524 7525 # refSseq refSeqLink 7526 databases_refseq = param_hgvs.get("refseq", databases_refseq) 7527 databases_refseqlink = param_hgvs.get("refseqlink", databases_refseqlink) 7528 7529 # Assembly 7530 assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY)) 7531 7532 # Genome 7533 genome_file = None 7534 if find_genome(databases_genome): 7535 genome_file = find_genome(databases_genome) 7536 else: 7537 genome_file = find_genome( 7538 genome_path=databases_genomes_folders, assembly=assembly 7539 ) 7540 log.debug("Genome: " + str(genome_file)) 7541 7542 # refSseq 7543 refseq_file = find_file_prefix( 7544 input_file=databases_refseq, 7545 prefix="ncbiRefSeq", 7546 folder=databases_refseq_folders, 7547 assembly=assembly, 7548 ) 7549 log.debug("refSeq: " + str(refseq_file)) 7550 7551 # refSeqLink 7552 refseqlink_file = find_file_prefix( 7553 input_file=databases_refseqlink, 7554 prefix="ncbiRefSeqLink", 7555 folder=databases_refseq_folders, 7556 assembly=assembly, 7557 ) 7558 log.debug("refSeqLink: " + str(refseqlink_file)) 7559 7560 # Threads 7561 if not threads: 7562 threads = self.get_threads() 7563 log.debug("Threads: " + str(threads)) 7564 7565 # Variables 7566 table_variants = self.get_table_variants(clause="update") 7567 7568 # Get variants SNV and InDel only 7569 
query_variants = f""" 7570 SELECT "#CHROM" AS CHROM, POS, REF, ALT 7571 FROM {table_variants} 7572 WHERE REF ~ '^[A-Za-z]+$' AND ALT ~ '^[A-Za-z]+$' 7573 """ 7574 df_variants = self.get_query_to_df(query_variants) 7575 7576 # Added columns 7577 added_columns = [] 7578 7579 # Add hgvs column in variants table 7580 hgvs_column_name = "hgvs_" + str(random.randrange(1000)) 7581 added_column = self.add_column( 7582 table_variants, hgvs_column_name, "STRING", default_value=None 7583 ) 7584 added_columns.append(added_column) 7585 7586 log.debug(f"refSeq loading...") 7587 # refSeq in duckDB 7588 refseq_table = get_refseq_table( 7589 conn=self.conn, refseq_table="refseq", refseq_file=refseq_file 7590 ) 7591 # Loading all refSeq in Dataframe 7592 refseq_query = f""" 7593 SELECT df_variants.CHROM, df_variants.POS, {refseq_table}.name AS transcript 7594 FROM {refseq_table} 7595 JOIN df_variants ON ( 7596 {refseq_table}.chrom = df_variants.CHROM 7597 AND {refseq_table}.txStart<=df_variants.POS 7598 AND {refseq_table}.txEnd>=df_variants.POS 7599 ) 7600 """ 7601 refseq_df = self.conn.query(refseq_query).pl() 7602 7603 if refseqlink_file: 7604 log.debug(f"refSeqLink loading...") 7605 # refSeqLink in duckDB 7606 refseqlink_table = get_refseq_table( 7607 conn=self.conn, refseq_table="refseqlink", refseq_file=refseqlink_file 7608 ) 7609 # Loading all refSeqLink in Dataframe 7610 protacc_column = "protAcc_with_ver" 7611 mrnaacc_column = "mrnaAcc_with_ver" 7612 refseqlink_query = f""" 7613 SELECT {refseq_table}.chrom, {protacc_column} AS protein, {mrnaacc_column} AS transcript 7614 FROM {refseqlink_table} 7615 JOIN {refseq_table} ON ({refseq_table}.name = {refseqlink_table}.mrnaAcc_with_ver) 7616 WHERE protAcc_without_ver IS NOT NULL 7617 """ 7618 # Polars Dataframe 7619 refseqlink_df = self.conn.query(f"{refseqlink_query}").pl() 7620 7621 # Read RefSeq transcripts into a python dict/model. 
7622 log.debug(f"Transcripts loading...") 7623 with tempfile.TemporaryDirectory() as tmpdir: 7624 transcripts_query = f""" 7625 COPY ( 7626 SELECT {refseq_table}.* 7627 FROM {refseq_table} 7628 JOIN df_variants ON ( 7629 {refseq_table}.chrom=df_variants.CHROM 7630 AND {refseq_table}.txStart<=df_variants.POS 7631 AND {refseq_table}.txEnd>=df_variants.POS 7632 ) 7633 ) 7634 TO '{tmpdir}/transcript.tsv' (DELIMITER '\t'); 7635 """ 7636 self.conn.query(transcripts_query) 7637 with open(f"{tmpdir}/transcript.tsv") as infile: 7638 transcripts = read_transcripts(infile) 7639 7640 # Polars connexion 7641 polars_conn = pl.SQLContext(register_globals=True, eager=True) 7642 7643 log.debug("Genome loading...") 7644 # Read genome sequence using pyfaidx. 7645 genome = Fasta(genome_file) 7646 7647 log.debug("Start annotation HGVS...") 7648 7649 # Create 7650 # a Dask Dataframe from Pandas dataframe with partition as number of threads 7651 ddf = dd.from_pandas(df_variants, npartitions=threads) 7652 7653 # Use dask.dataframe.apply() to apply function on each partition 7654 ddf[hgvs_column_name] = ddf.map_partitions(partition_function) 7655 7656 # Convert Dask DataFrame to Pandas Dataframe 7657 df = ddf.compute() 7658 7659 # Convert Pandas dataframe to parquet (due to error in cast VARCHAR -> NULL ???) 
        with tempfile.TemporaryDirectory() as tmpdir:
            df_parquet = os.path.join(tmpdir, "df.parquet")
            df.to_parquet(df_parquet)

            # Update hgvs column
            # Join the computed dataframe back onto the variants table by locus+alleles
            update_variant_query = f"""
                UPDATE {table_variants}
                SET "{hgvs_column_name}"=df."{hgvs_column_name}"
                FROM read_parquet('{df_parquet}') as df
                WHERE variants."#CHROM" = df.CHROM
                AND variants.POS = df.POS
                AND variants.REF = df.REF
                AND variants.ALT = df.ALT
                AND df."{hgvs_column_name}" NOT IN ('') AND df."{hgvs_column_name}" NOT NULL
            """
            self.execute_query(update_variant_query)

            # Update INFO column
            sql_query_update = f"""
                UPDATE {table_variants}
                SET INFO =
                    concat(
                        CASE
                            WHEN INFO NOT IN ('','.')
                            THEN concat(INFO, ';')
                            ELSE ''
                        END,
                        'hgvs=',
                        {hgvs_column_name}
                    )
                WHERE "{hgvs_column_name}" NOT IN ('') AND "{hgvs_column_name}" NOT NULL
            """
            self.execute_query(sql_query_update)

        # Add header
        HGVS_INFOS = {
            "hgvs": {
                "ID": "hgvs",
                "Number": ".",
                "Type": "String",
                "Description": f"HGVS annotatation with HOWARD",
            }
        }

        for field in HGVS_INFOS:
            field_ID = HGVS_INFOS[field]["ID"]
            field_description = HGVS_INFOS[field]["Description"]
            self.get_header().infos[field_ID] = vcf.parser._Info(
                field_ID,
                HGVS_INFOS[field]["Number"],
                HGVS_INFOS[field]["Type"],
                field_description,
                "unknown",
                "unknown",
                code_type_map[HGVS_INFOS[field]["Type"]],
            )

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)

    ###
    # Calculation
    ###

    def get_operations_help(
        self, operations_config_dict: dict = {}, operations_config_file: str = None
    ) -> list:
        """
        Build a human-readable help listing of the available calculation operations.

        :param operations_config_dict: Optional operations configuration as a dict
        :param operations_config_file: Optional operations configuration file path
        :return: A sorted list of help lines, headed by "Available calculation operations:"
        """
        # NOTE(review): mutable default argument ({}) — confirm it is never mutated

        # Init
        operations_help = []

        # operations
        operations = self.get_config_json(
            name="calculations",
            config_dict=operations_config_dict,
            config_file=operations_config_file,
        )
        for op in operations:
            op_name = operations[op].get("name", op).upper()
            op_description = operations[op].get("description", op_name)
            op_available = operations[op].get("available", False)
            # Only operations flagged as available are listed
            if op_available:
                operations_help.append(f"   {op_name}: {op_description}")

        # Sort operations
        operations_help.sort()

        # insert header
        operations_help.insert(0, "Available calculation operations:")

        # Return
        return operations_help

    def calculation(
        self,
        operations: dict = {},
        operations_config_dict: dict = {},
        operations_config_file: str = None,
    ) -> None:
        """
        It takes a list of operations, and for each operation, it checks if it's a python or sql
        operation, and then calls the appropriate function

        param json example:
            "calculation": {
                "NOMEN": {
                    "options": {
                        "hgvs_field": "hgvs"
                    },
                "middle" : null
            }
        """
        # NOTE(review): mutable default argument ({}) — confirm it is never mutated

        # Param
        param = self.get_param()

        # operations config
        operations_config = self.get_config_json(
            name="calculations",
            config_dict=operations_config_dict,
            config_file=operations_config_file,
        )

        # Upper keys
        # Operation lookup is case-insensitive: normalize config keys to upper case
        operations_config = {k.upper(): v for k, v in operations_config.items()}

        # Calculations

        # Operations from param
        operations = param.get("calculation", {}).get("calculations", operations)

        # Quick calculation - add
        # The comma-separated "calculations" shortcut adds operations (empty options)
        if param.get("calculations", None):
            calculations_list = [
                value for value in param.get("calculations", "").split(",")
            ]
            log.info(f"Quick Calculations:")
            for calculation_key in calculations_list:
                log.info(f"   {calculation_key}")
            for calculation_operation in calculations_list:
                if calculation_operation.upper() not in operations:
                    operations[calculation_operation.upper()] = {}
add_value_into_dict( 7804 dict_tree=param, 7805 sections=[ 7806 "calculation", 7807 "calculations", 7808 calculation_operation.upper(), 7809 ], 7810 value={}, 7811 ) 7812 7813 # Operations for calculation 7814 if not operations: 7815 operations = param.get("calculation", {}).get("calculations", {}) 7816 7817 if operations: 7818 log.info(f"Calculations...") 7819 7820 # For each operations 7821 for operation_name in operations: 7822 operation_name = operation_name.upper() 7823 if operation_name not in [""]: 7824 if operation_name in operations_config: 7825 log.info(f"Calculation '{operation_name}'") 7826 operation = operations_config[operation_name] 7827 operation_type = operation.get("type", "sql") 7828 if operation_type == "python": 7829 self.calculation_process_function( 7830 operation=operation, operation_name=operation_name 7831 ) 7832 elif operation_type == "sql": 7833 self.calculation_process_sql( 7834 operation=operation, operation_name=operation_name 7835 ) 7836 else: 7837 log.error( 7838 f"Operations config: Type '{operation_type}' NOT available" 7839 ) 7840 raise ValueError( 7841 f"Operations config: Type '{operation_type}' NOT available" 7842 ) 7843 else: 7844 log.error( 7845 f"Operations config: Calculation '{operation_name}' NOT available" 7846 ) 7847 raise ValueError( 7848 f"Operations config: Calculation '{operation_name}' NOT available" 7849 ) 7850 7851 # Explode INFOS fields into table fields 7852 if self.get_explode_infos(): 7853 self.explode_infos( 7854 prefix=self.get_explode_infos_prefix(), 7855 fields=self.get_explode_infos_fields(), 7856 force=True, 7857 ) 7858 7859 def calculation_process_sql( 7860 self, operation: dict, operation_name: str = "unknown" 7861 ) -> None: 7862 """ 7863 The `calculation_process_sql` function takes in a mathematical operation as a string and 7864 performs the operation, updating the specified table with the result. 
7865 7866 :param operation: The `operation` parameter is a dictionary that contains information about the 7867 mathematical operation to be performed. It includes the following keys: 7868 :type operation: dict 7869 :param operation_name: The `operation_name` parameter is a string that represents the name of 7870 the mathematical operation being performed. It is used for logging and error handling purposes, 7871 defaults to unknown 7872 :type operation_name: str (optional) 7873 """ 7874 7875 # table variants 7876 table_variants = self.get_table_variants(clause="alter") 7877 7878 # Operation infos 7879 operation_name = operation.get("name", "unknown") 7880 log.debug(f"process sql {operation_name}") 7881 output_column_name = operation.get("output_column_name", operation_name) 7882 output_column_type = operation.get("output_column_type", "String") 7883 prefix = operation.get("explode_infos_prefix", "") 7884 output_column_type_sql = code_type_map_to_sql.get(output_column_type, "VARCHAR") 7885 output_column_description = operation.get( 7886 "output_column_description", f"{operation_name} operation" 7887 ) 7888 operation_query = operation.get("operation_query", None) 7889 if isinstance(operation_query, list): 7890 operation_query = " ".join(operation_query) 7891 operation_info_fields = operation.get("info_fields", []) 7892 operation_info_fields_check = operation.get("info_fields_check", False) 7893 operation_info = operation.get("operation_info", True) 7894 7895 if operation_query: 7896 7897 # Info fields check 7898 operation_info_fields_check_result = True 7899 if operation_info_fields_check: 7900 header_infos = self.get_header().infos 7901 for info_field in operation_info_fields: 7902 operation_info_fields_check_result = ( 7903 operation_info_fields_check_result 7904 and info_field in header_infos 7905 ) 7906 7907 # If info fields available 7908 if operation_info_fields_check_result: 7909 7910 # Added_columns 7911 added_columns = [] 7912 7913 # Create VCF header field 
7914 vcf_reader = self.get_header() 7915 vcf_reader.infos[output_column_name] = vcf.parser._Info( 7916 output_column_name, 7917 ".", 7918 output_column_type, 7919 output_column_description, 7920 "howard calculation", 7921 "0", 7922 self.code_type_map.get(output_column_type), 7923 ) 7924 7925 # Explode infos if needed 7926 log.debug(f"calculation_process_sql prefix {prefix}") 7927 added_columns += self.explode_infos( 7928 prefix=prefix, 7929 fields=[output_column_name] + operation_info_fields, 7930 force=True, 7931 ) 7932 7933 # Create column 7934 added_column = self.add_column( 7935 table_name=table_variants, 7936 column_name=prefix + output_column_name, 7937 column_type=output_column_type_sql, 7938 default_value="null", 7939 ) 7940 added_columns.append(added_column) 7941 7942 # Operation calculation 7943 try: 7944 7945 # Query to update calculation column 7946 sql_update = f""" 7947 UPDATE {table_variants} 7948 SET "{prefix}{output_column_name}" = ({operation_query}) 7949 """ 7950 self.conn.execute(sql_update) 7951 7952 # Add to INFO 7953 if operation_info: 7954 sql_update_info = f""" 7955 UPDATE {table_variants} 7956 SET "INFO" = 7957 concat( 7958 CASE 7959 WHEN "INFO" IS NOT NULL 7960 THEN concat("INFO", ';') 7961 ELSE '' 7962 END, 7963 '{output_column_name}=', 7964 "{prefix}{output_column_name}" 7965 ) 7966 WHERE "{prefix}{output_column_name}" IS NOT NULL AND "{prefix}{output_column_name}" NOT IN ('') 7967 """ 7968 self.conn.execute(sql_update_info) 7969 7970 except: 7971 log.error( 7972 f"Operations config: Calculation '{operation_name}' query failed" 7973 ) 7974 raise ValueError( 7975 f"Operations config: Calculation '{operation_name}' query failed" 7976 ) 7977 7978 # Remove added columns 7979 for added_column in added_columns: 7980 log.debug(f"added_column: {added_column}") 7981 self.drop_column(column=added_column) 7982 7983 else: 7984 log.error( 7985 f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields 
{operation_info_fields}" 7986 ) 7987 raise ValueError( 7988 f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}" 7989 ) 7990 7991 else: 7992 log.error( 7993 f"Operations config: Calculation '{operation_name}' query NOT defined" 7994 ) 7995 raise ValueError( 7996 f"Operations config: Calculation '{operation_name}' query NOT defined" 7997 ) 7998 7999 def calculation_process_function( 8000 self, operation: dict, operation_name: str = "unknown" 8001 ) -> None: 8002 """ 8003 The `calculation_process_function` takes in an operation dictionary and performs the specified 8004 function with the given parameters. 8005 8006 :param operation: The `operation` parameter is a dictionary that contains information about the 8007 operation to be performed. It has the following keys: 8008 :type operation: dict 8009 :param operation_name: The `operation_name` parameter is a string that represents the name of 8010 the operation being performed. It is used for logging purposes, defaults to unknown 8011 :type operation_name: str (optional) 8012 """ 8013 8014 operation_name = operation["name"] 8015 log.debug(f"process sql {operation_name}") 8016 function_name = operation["function_name"] 8017 function_params = operation["function_params"] 8018 getattr(self, function_name)(*function_params) 8019 8020 def calculation_variant_id(self) -> None: 8021 """ 8022 The function `calculation_variant_id` adds a variant ID annotation to a VCF file header and 8023 updates the INFO field of a variants table with the variant ID. 
8024 """ 8025 8026 # variant_id annotation field 8027 variant_id_tag = self.get_variant_id_column() 8028 added_columns = [variant_id_tag] 8029 8030 # variant_id hgvs tags" 8031 vcf_infos_tags = { 8032 variant_id_tag: "howard variant ID annotation", 8033 } 8034 8035 # Variants table 8036 table_variants = self.get_table_variants() 8037 8038 # Header 8039 vcf_reader = self.get_header() 8040 8041 # Add variant_id to header 8042 vcf_reader.infos[variant_id_tag] = vcf.parser._Info( 8043 variant_id_tag, 8044 ".", 8045 "String", 8046 vcf_infos_tags.get(variant_id_tag, "howard variant ID annotation"), 8047 "howard calculation", 8048 "0", 8049 self.code_type_map.get("String"), 8050 ) 8051 8052 # Update 8053 sql_update = f""" 8054 UPDATE {table_variants} 8055 SET "INFO" = 8056 concat( 8057 CASE 8058 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 8059 THEN '' 8060 ELSE concat("INFO", ';') 8061 END, 8062 '{variant_id_tag}=', 8063 "{variant_id_tag}" 8064 ) 8065 """ 8066 self.conn.execute(sql_update) 8067 8068 # Remove added columns 8069 for added_column in added_columns: 8070 self.drop_column(column=added_column) 8071 8072 def calculation_extract_snpeff_hgvs( 8073 self, 8074 snpeff_hgvs: str = "snpeff_hgvs", 8075 snpeff_field: str = "ANN", 8076 ) -> None: 8077 """ 8078 The function `calculation_extract_snpeff_hgvs` extracts HGVS nomenclatures from the SnpEff 8079 annotation field in a VCF file and adds them as a new column in the variants table. 8080 8081 :param snpeff_hgvs: The `snpeff_hgvs` parameter in the `calculation_extract_snpeff_hgvs` 8082 function is used to specify the name of the column that will store the HGVS nomenclatures 8083 extracted from the SnpEff annotation field in a VCF file. This parameter allows you, defaults to 8084 snpeff_hgvs 8085 :type snpeff_hgvs: str (optional) 8086 :param snpeff_field: The `snpeff_field` parameter in the `calculation_extract_snpeff_hgvs` 8087 function represents the field in the VCF file that contains SnpEff annotations. 
This field is 8088 used to extract HGVS nomenclatures from the SnpEff annotation field and add them as a, defaults 8089 to ANN 8090 :type snpeff_field: str (optional) 8091 """ 8092 8093 # Snpeff hgvs tags 8094 vcf_infos_tags = { 8095 snpeff_hgvs: "HGVS nomenclatures from snpEff annotation", 8096 } 8097 8098 # Prefix 8099 prefix = self.get_explode_infos_prefix() 8100 if prefix: 8101 prefix = "INFO/" 8102 8103 # snpEff fields 8104 speff_ann_infos = prefix + snpeff_field 8105 speff_hgvs_infos = prefix + snpeff_hgvs 8106 8107 # Variants table 8108 table_variants = self.get_table_variants() 8109 8110 # Header 8111 vcf_reader = self.get_header() 8112 8113 # Add columns 8114 added_columns = [] 8115 8116 # Explode HGVS field in column 8117 added_columns += self.explode_infos(fields=[snpeff_field]) 8118 8119 if snpeff_field in vcf_reader.infos: 8120 8121 log.debug(vcf_reader.infos[snpeff_field]) 8122 8123 # Extract ANN header 8124 ann_description = vcf_reader.infos[snpeff_field].desc 8125 pattern = r"'(.+?)'" 8126 match = re.search(pattern, ann_description) 8127 if match: 8128 ann_header_match = match.group(1).split(" | ") 8129 ann_header_desc = {} 8130 for i in range(len(ann_header_match)): 8131 ann_header_info = "".join( 8132 char for char in ann_header_match[i] if char.isalnum() 8133 ) 8134 ann_header_desc[ann_header_info] = ann_header_match[i] 8135 if not ann_header_desc: 8136 raise ValueError("Invalid header description format") 8137 else: 8138 raise ValueError("Invalid header description format") 8139 8140 # Create variant id 8141 variant_id_column = self.get_variant_id_column() 8142 added_columns += [variant_id_column] 8143 8144 # Create dataframe 8145 dataframe_snpeff_hgvs = self.get_query_to_df( 8146 f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """ 8147 ) 8148 8149 # Create main NOMEN column 8150 dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[ 8151 speff_ann_infos 8152 ].apply( 8153 lambda x: extract_snpeff_hgvs( 
8154 str(x), header=list(ann_header_desc.values()) 8155 ) 8156 ) 8157 8158 # Add snpeff_hgvs to header 8159 vcf_reader.infos[snpeff_hgvs] = vcf.parser._Info( 8160 snpeff_hgvs, 8161 ".", 8162 "String", 8163 vcf_infos_tags.get(snpeff_hgvs, "snpEff hgvs annotations"), 8164 "howard calculation", 8165 "0", 8166 self.code_type_map.get("String"), 8167 ) 8168 8169 # Update 8170 sql_update = f""" 8171 UPDATE variants 8172 SET "INFO" = 8173 concat( 8174 CASE 8175 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 8176 THEN '' 8177 ELSE concat("INFO", ';') 8178 END, 8179 CASE 8180 WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN') 8181 AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL 8182 THEN concat( 8183 '{snpeff_hgvs}=', 8184 dataframe_snpeff_hgvs."{speff_hgvs_infos}" 8185 ) 8186 ELSE '' 8187 END 8188 ) 8189 FROM dataframe_snpeff_hgvs 8190 WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}" 8191 8192 """ 8193 self.conn.execute(sql_update) 8194 8195 # Delete dataframe 8196 del dataframe_snpeff_hgvs 8197 gc.collect() 8198 8199 else: 8200 8201 log.warning( 8202 "No snpEff annotation. Please Anotate with snpEff before use this calculation option" 8203 ) 8204 8205 # Remove added columns 8206 for added_column in added_columns: 8207 self.drop_column(column=added_column) 8208 8209 def calculation_snpeff_ann_explode( 8210 self, 8211 uniquify: bool = True, 8212 output_format: str = "fields", 8213 output_prefix: str = "snpeff_", 8214 snpeff_field: str = "ANN", 8215 ) -> None: 8216 """ 8217 The `calculation_snpeff_ann_explode` function processes SnpEff annotations in a VCF file by 8218 exploding the HGVS field and updating variant information accordingly. 8219 8220 :param uniquify: The `uniquify` parameter in the `calculation_snpeff_ann_explode` method is a 8221 boolean flag that determines whether the output should be uniquified or not. 
When set to `True`, 8222 it indicates that the output should be unique, meaning that duplicate entries should be removed, 8223 defaults to True 8224 :type uniquify: bool (optional) 8225 :param output_format: The `output_format` parameter in the `calculation_snpeff_ann_explode` 8226 function specifies the format in which the output annotations will be generated. It has a 8227 default value of "fields". You can also set it to "JSON" to output the annotations in JSON 8228 format, defaults to fields 8229 :type output_format: str (optional) 8230 :param output_prefix: The `output_prefix` parameter in the `calculation_snpeff_ann_explode` 8231 method is used to specify the prefix that will be added to the output annotations generated 8232 during the calculation process. This prefix helps to differentiate the newly added annotations 8233 from existing ones in the output data. By default, the, defaults to ANN_ 8234 :type output_prefix: str (optional) 8235 :param snpeff_field: The `snpeff_field` parameter in the `calculation_snpeff_ann_explode` 8236 function is used to specify the field in the VCF file that contains SnpEff annotations. 
This 8237 field will be processed to explode the HGVS annotations and update the variant information 8238 accordingly, defaults to ANN 8239 :type snpeff_field: str (optional) 8240 """ 8241 8242 # SnpEff annotation field 8243 snpeff_hgvs = "snpeff_ann_explode" 8244 8245 # Snpeff hgvs tags 8246 vcf_infos_tags = { 8247 snpeff_hgvs: "Explode snpEff annotations", 8248 } 8249 8250 # Prefix 8251 prefix = self.get_explode_infos_prefix() 8252 if prefix: 8253 prefix = "INFO/" 8254 8255 # snpEff fields 8256 speff_ann_infos = prefix + snpeff_field 8257 speff_hgvs_infos = prefix + snpeff_hgvs 8258 8259 # Variants table 8260 table_variants = self.get_table_variants() 8261 8262 # Header 8263 vcf_reader = self.get_header() 8264 8265 # Add columns 8266 added_columns = [] 8267 8268 # Explode HGVS field in column 8269 added_columns += self.explode_infos(fields=[snpeff_field]) 8270 log.debug(f"snpeff_field={snpeff_field}") 8271 log.debug(f"added_columns={added_columns}") 8272 8273 if snpeff_field in vcf_reader.infos: 8274 8275 # Extract ANN header 8276 ann_description = vcf_reader.infos[snpeff_field].desc 8277 pattern = r"'(.+?)'" 8278 match = re.search(pattern, ann_description) 8279 if match: 8280 ann_header_match = match.group(1).split(" | ") 8281 ann_header = [] 8282 ann_header_desc = {} 8283 for i in range(len(ann_header_match)): 8284 ann_header_info = "".join( 8285 char for char in ann_header_match[i] if char.isalnum() 8286 ) 8287 ann_header.append(ann_header_info) 8288 ann_header_desc[ann_header_info] = ann_header_match[i] 8289 if not ann_header_desc: 8290 raise ValueError("Invalid header description format") 8291 else: 8292 raise ValueError("Invalid header description format") 8293 8294 # Create variant id 8295 variant_id_column = self.get_variant_id_column() 8296 added_columns += [variant_id_column] 8297 8298 # Create dataframe 8299 dataframe_snpeff_hgvs = self.get_query_to_df( 8300 f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """ 8301 ) 8302 
8303 # Create snpEff columns 8304 dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[ 8305 speff_ann_infos 8306 ].apply( 8307 lambda x: explode_snpeff_ann( 8308 str(x), 8309 uniquify=uniquify, 8310 output_format=output_format, 8311 prefix=output_prefix, 8312 header=list(ann_header_desc.values()), 8313 ) 8314 ) 8315 8316 # Header 8317 ann_annotations_prefix = "" 8318 if output_format.upper() in ["JSON"]: 8319 ann_annotations_prefix = f"{output_prefix}=" 8320 vcf_reader.infos[output_prefix] = vcf.parser._Info( 8321 output_prefix, 8322 ".", 8323 "String", 8324 vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations") 8325 + " - JSON format", 8326 "howard calculation", 8327 "0", 8328 self.code_type_map.get("String"), 8329 ) 8330 else: 8331 for ann_annotation in ann_header: 8332 ann_annotation_id = f"{output_prefix}{ann_annotation}" 8333 vcf_reader.infos[ann_annotation_id] = vcf.parser._Info( 8334 ann_annotation_id, 8335 ".", 8336 "String", 8337 vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations") 8338 + f" - '{ann_header_desc[ann_annotation]}' annotation", 8339 "howard calculation", 8340 "0", 8341 self.code_type_map.get("String"), 8342 ) 8343 8344 # Update 8345 sql_update = f""" 8346 UPDATE variants 8347 SET "INFO" = 8348 concat( 8349 CASE 8350 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 8351 THEN '' 8352 ELSE concat("INFO", ';') 8353 END, 8354 CASE 8355 WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN') 8356 AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL 8357 THEN concat( 8358 '{ann_annotations_prefix}', 8359 dataframe_snpeff_hgvs."{speff_hgvs_infos}" 8360 ) 8361 ELSE '' 8362 END 8363 ) 8364 FROM dataframe_snpeff_hgvs 8365 WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}" 8366 8367 """ 8368 self.conn.execute(sql_update) 8369 8370 # Delete dataframe 8371 del dataframe_snpeff_hgvs 8372 gc.collect() 8373 8374 else: 8375 8376 log.warning( 8377 "No snpEff annotation. 
Please Anotate with snpEff before use this calculation option" 8378 ) 8379 8380 # Remove added columns 8381 for added_column in added_columns: 8382 self.drop_column(column=added_column) 8383 8384 def calculation_extract_nomen(self) -> None: 8385 """ 8386 This function extracts the HGVS nomenclature from the calculation/identification of NOMEN. 8387 """ 8388 8389 # NOMEN field 8390 field_nomen_dict = "NOMEN_DICT" 8391 8392 # NOMEN structure 8393 nomen_dict = { 8394 "NOMEN": "NOMEN hgvs nomenclature considered as reference hgvs (official transcript, first otherwise)", 8395 "CNOMEN": "CNOMEN hgvs nomenclature at DNA level related to a transcript (TNOMEN)", 8396 "RNOMEN": "RNOMEN hgvs nomenclature at RNA level related to a transcript (TNOMEN)", 8397 "NNOMEN": "NNOMEN hgvs nomenclature for non-coding variant", 8398 "PNOMEN": "PNOMEN hgvs nomenclature at Protein level related to a transcript (TNOMEN)", 8399 "TVNOMEN": "TVNOMEN hgvs transcript with version (if any) used (e.g. for CNOMEN and PNOMEN)", 8400 "TNOMEN": "TNOMEN hgvs transcript used (e.g. for CNOMEN and PNOMEN)", 8401 "VNOMEN": "VNOMEN hgvs transcript version used (e.g. 
for CNOMEN and PNOMEN)", 8402 "ENOMEN": "ENOMEN hgvs exon nomenclature related to a transcript (TNOMEN)", 8403 "GNOMEN": "GNOMEN hgvs gene nomenclature related to a transcript (TNOMEN)", 8404 } 8405 8406 # Param 8407 param = self.get_param() 8408 8409 # Prefix 8410 prefix = self.get_explode_infos_prefix() 8411 8412 # Header 8413 vcf_reader = self.get_header() 8414 8415 # Get HGVS field 8416 hgvs_field = ( 8417 param.get("calculation", {}) 8418 .get("calculations", {}) 8419 .get("NOMEN", {}) 8420 .get("options", {}) 8421 .get("hgvs_field", "hgvs") 8422 ) 8423 8424 # Get transcripts 8425 transcripts_file = ( 8426 param.get("calculation", {}) 8427 .get("calculations", {}) 8428 .get("NOMEN", {}) 8429 .get("options", {}) 8430 .get("transcripts", None) 8431 ) 8432 transcripts_file = full_path(transcripts_file) 8433 transcripts = [] 8434 if transcripts_file: 8435 if os.path.exists(transcripts_file): 8436 transcripts_dataframe = transcripts_file_to_df(transcripts_file) 8437 transcripts = transcripts_dataframe.iloc[:, 0].tolist() 8438 else: 8439 log.error(f"Transcript file '{transcripts_file}' does NOT exist") 8440 raise ValueError(f"Transcript file '{transcripts_file}' does NOT exist") 8441 8442 # Added columns 8443 added_columns = [] 8444 8445 # Explode HGVS field in column 8446 added_columns += self.explode_infos(fields=[hgvs_field]) 8447 8448 # extra infos 8449 extra_infos = self.get_extra_infos() 8450 extra_field = prefix + hgvs_field 8451 8452 if extra_field in extra_infos: 8453 8454 # Create dataframe 8455 dataframe_hgvs = self.get_query_to_df( 8456 f""" SELECT "#CHROM", "POS", "REF", "ALT", "{extra_field}" FROM variants """ 8457 ) 8458 8459 # Create main NOMEN column 8460 dataframe_hgvs[field_nomen_dict] = dataframe_hgvs[extra_field].apply( 8461 lambda x: find_nomen(str(x), transcripts=transcripts) 8462 ) 8463 8464 # Explode NOMEN Structure and create SQL set for update 8465 sql_nomen_fields = [] 8466 for nomen_field in nomen_dict: 8467 8468 # Explode each field 
into a column 8469 dataframe_hgvs[nomen_field] = dataframe_hgvs[field_nomen_dict].apply( 8470 lambda x: dict(x).get(nomen_field, "") 8471 ) 8472 8473 # Create VCF header field 8474 vcf_reader.infos[nomen_field] = vcf.parser._Info( 8475 nomen_field, 8476 ".", 8477 "String", 8478 nomen_dict.get(nomen_field, "howard calculation NOMEN"), 8479 "howard calculation", 8480 "0", 8481 self.code_type_map.get("String"), 8482 ) 8483 sql_nomen_fields.append( 8484 f""" 8485 CASE 8486 WHEN dataframe_hgvs."{nomen_field}" NOT NULL AND dataframe_hgvs."{nomen_field}" NOT IN ('') 8487 THEN concat( 8488 ';{nomen_field}=', 8489 dataframe_hgvs."{nomen_field}" 8490 ) 8491 ELSE '' 8492 END 8493 """ 8494 ) 8495 8496 # SQL set for update 8497 sql_nomen_fields_set = ", ".join(sql_nomen_fields) 8498 8499 # Update 8500 sql_update = f""" 8501 UPDATE variants 8502 SET "INFO" = 8503 concat( 8504 CASE 8505 WHEN "INFO" IS NULL 8506 THEN '' 8507 ELSE "INFO" 8508 END, 8509 {sql_nomen_fields_set} 8510 ) 8511 FROM dataframe_hgvs 8512 WHERE variants."#CHROM" = dataframe_hgvs."#CHROM" 8513 AND variants."POS" = dataframe_hgvs."POS" 8514 AND variants."REF" = dataframe_hgvs."REF" 8515 AND variants."ALT" = dataframe_hgvs."ALT" 8516 """ 8517 self.conn.execute(sql_update) 8518 8519 # Delete dataframe 8520 del dataframe_hgvs 8521 gc.collect() 8522 8523 # Remove added columns 8524 for added_column in added_columns: 8525 self.drop_column(column=added_column) 8526 8527 def calculation_find_by_pipeline(self, tag: str = "findbypipeline") -> None: 8528 """ 8529 The function `calculation_find_by_pipeline` performs a calculation to find the number of 8530 pipeline/sample for a variant and updates the variant information in a VCF file. 8531 8532 :param tag: The `tag` parameter is a string that represents the annotation field for the 8533 "findbypipeline" information in the VCF file. 
It is used to create the annotation field in the 8534 VCF header and to update the corresponding field in the variants table, defaults to 8535 findbypipeline 8536 :type tag: str (optional) 8537 """ 8538 8539 # if FORMAT and samples 8540 if ( 8541 "FORMAT" in self.get_header_columns_as_list() 8542 and self.get_header_sample_list() 8543 ): 8544 8545 # findbypipeline annotation field 8546 findbypipeline_tag = tag 8547 8548 # VCF infos tags 8549 vcf_infos_tags = { 8550 findbypipeline_tag: f"Number of pipeline/sample for a variant ({findbypipeline_tag})", 8551 } 8552 8553 # Prefix 8554 prefix = self.get_explode_infos_prefix() 8555 8556 # Field 8557 findbypipeline_infos = prefix + findbypipeline_tag 8558 8559 # Variants table 8560 table_variants = self.get_table_variants() 8561 8562 # Header 8563 vcf_reader = self.get_header() 8564 8565 # Create variant id 8566 variant_id_column = self.get_variant_id_column() 8567 added_columns = [variant_id_column] 8568 8569 # variant_id, FORMAT and samples 8570 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 8571 self.get_header_sample_list() 8572 ) 8573 8574 # Create dataframe 8575 dataframe_findbypipeline = self.get_query_to_df( 8576 f""" SELECT {samples_fields} FROM {table_variants} """ 8577 ) 8578 8579 # Create findbypipeline column 8580 dataframe_findbypipeline[findbypipeline_infos] = ( 8581 dataframe_findbypipeline.apply( 8582 lambda row: findbypipeline( 8583 row, samples=self.get_header_sample_list() 8584 ), 8585 axis=1, 8586 ) 8587 ) 8588 8589 # Add snpeff_hgvs to header 8590 vcf_reader.infos[findbypipeline_tag] = vcf.parser._Info( 8591 findbypipeline_tag, 8592 ".", 8593 "String", 8594 vcf_infos_tags.get(findbypipeline_tag, "Find in pipeline/sample"), 8595 "howard calculation", 8596 "0", 8597 self.code_type_map.get("String"), 8598 ) 8599 8600 # Update 8601 sql_update = f""" 8602 UPDATE variants 8603 SET "INFO" = 8604 concat( 8605 CASE 8606 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 8607 THEN '' 8608 ELSE 
concat("INFO", ';') 8609 END, 8610 CASE 8611 WHEN dataframe_findbypipeline."{findbypipeline_infos}" NOT IN ('','.') 8612 AND dataframe_findbypipeline."{findbypipeline_infos}" NOT NULL 8613 THEN concat( 8614 '{findbypipeline_tag}=', 8615 dataframe_findbypipeline."{findbypipeline_infos}" 8616 ) 8617 ELSE '' 8618 END 8619 ) 8620 FROM dataframe_findbypipeline 8621 WHERE variants."{variant_id_column}" = dataframe_findbypipeline."{variant_id_column}" 8622 """ 8623 self.conn.execute(sql_update) 8624 8625 # Remove added columns 8626 for added_column in added_columns: 8627 self.drop_column(column=added_column) 8628 8629 # Delete dataframe 8630 del dataframe_findbypipeline 8631 gc.collect() 8632 8633 def calculation_genotype_concordance(self) -> None: 8634 """ 8635 The function `calculation_genotype_concordance` calculates the genotype concordance for 8636 multi-caller VCF files and updates the variant information in the database. 8637 """ 8638 8639 # if FORMAT and samples 8640 if ( 8641 "FORMAT" in self.get_header_columns_as_list() 8642 and self.get_header_sample_list() 8643 ): 8644 8645 # genotypeconcordance annotation field 8646 genotypeconcordance_tag = "genotypeconcordance" 8647 8648 # VCF infos tags 8649 vcf_infos_tags = { 8650 genotypeconcordance_tag: "Concordance of genotype for multi caller VCF", 8651 } 8652 8653 # Prefix 8654 prefix = self.get_explode_infos_prefix() 8655 8656 # Field 8657 genotypeconcordance_infos = prefix + genotypeconcordance_tag 8658 8659 # Variants table 8660 table_variants = self.get_table_variants() 8661 8662 # Header 8663 vcf_reader = self.get_header() 8664 8665 # Create variant id 8666 variant_id_column = self.get_variant_id_column() 8667 added_columns = [variant_id_column] 8668 8669 # variant_id, FORMAT and samples 8670 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 8671 self.get_header_sample_list() 8672 ) 8673 8674 # Create dataframe 8675 dataframe_genotypeconcordance = self.get_query_to_df( 8676 f""" SELECT 
{samples_fields} FROM {table_variants} """ 8677 ) 8678 8679 # Create genotypeconcordance column 8680 dataframe_genotypeconcordance[genotypeconcordance_infos] = ( 8681 dataframe_genotypeconcordance.apply( 8682 lambda row: genotypeconcordance( 8683 row, samples=self.get_header_sample_list() 8684 ), 8685 axis=1, 8686 ) 8687 ) 8688 8689 # Add genotypeconcordance to header 8690 vcf_reader.infos[genotypeconcordance_tag] = vcf.parser._Info( 8691 genotypeconcordance_tag, 8692 ".", 8693 "String", 8694 vcf_infos_tags.get(genotypeconcordance_tag, "snpEff hgvs annotations"), 8695 "howard calculation", 8696 "0", 8697 self.code_type_map.get("String"), 8698 ) 8699 8700 # Update 8701 sql_update = f""" 8702 UPDATE variants 8703 SET "INFO" = 8704 concat( 8705 CASE 8706 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 8707 THEN '' 8708 ELSE concat("INFO", ';') 8709 END, 8710 CASE 8711 WHEN dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT IN ('','.') 8712 AND dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT NULL 8713 THEN concat( 8714 '{genotypeconcordance_tag}=', 8715 dataframe_genotypeconcordance."{genotypeconcordance_infos}" 8716 ) 8717 ELSE '' 8718 END 8719 ) 8720 FROM dataframe_genotypeconcordance 8721 WHERE variants."{variant_id_column}" = dataframe_genotypeconcordance."{variant_id_column}" 8722 """ 8723 self.conn.execute(sql_update) 8724 8725 # Remove added columns 8726 for added_column in added_columns: 8727 self.drop_column(column=added_column) 8728 8729 # Delete dataframe 8730 del dataframe_genotypeconcordance 8731 gc.collect() 8732 8733 def calculation_barcode(self, tag: str = "barcode") -> None: 8734 """ 8735 The `calculation_barcode` function calculates barcode values for variants in a VCF file and 8736 updates the INFO field in the file with the calculated barcode values. 8737 8738 :param tag: The `tag` parameter in the `calculation_barcode` function is used to specify the tag 8739 name that will be used for the barcode calculation in the VCF file. 
If no tag name is provided, 8740 the default tag name is set to "barcode", defaults to barcode 8741 :type tag: str (optional) 8742 """ 8743 8744 # if FORMAT and samples 8745 if ( 8746 "FORMAT" in self.get_header_columns_as_list() 8747 and self.get_header_sample_list() 8748 ): 8749 8750 # barcode annotation field 8751 if not tag: 8752 tag = "barcode" 8753 8754 # VCF infos tags 8755 vcf_infos_tags = { 8756 tag: "barcode calculation (VaRank)", 8757 } 8758 8759 # Prefix 8760 prefix = self.get_explode_infos_prefix() 8761 8762 # Field 8763 barcode_infos = prefix + tag 8764 8765 # Variants table 8766 table_variants = self.get_table_variants() 8767 8768 # Header 8769 vcf_reader = self.get_header() 8770 8771 # Create variant id 8772 variant_id_column = self.get_variant_id_column() 8773 added_columns = [variant_id_column] 8774 8775 # variant_id, FORMAT and samples 8776 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 8777 self.get_header_sample_list() 8778 ) 8779 8780 # Create dataframe 8781 dataframe_barcode = self.get_query_to_df( 8782 f""" SELECT {samples_fields} FROM {table_variants} """ 8783 ) 8784 8785 # Create barcode column 8786 dataframe_barcode[barcode_infos] = dataframe_barcode.apply( 8787 lambda row: barcode(row, samples=self.get_header_sample_list()), axis=1 8788 ) 8789 8790 # Add barcode to header 8791 vcf_reader.infos[tag] = vcf.parser._Info( 8792 tag, 8793 ".", 8794 "String", 8795 vcf_infos_tags.get(tag, vcf_infos_tags.get(tag)), 8796 "howard calculation", 8797 "0", 8798 self.code_type_map.get("String"), 8799 ) 8800 8801 # Update 8802 sql_update = f""" 8803 UPDATE {table_variants} 8804 SET "INFO" = 8805 concat( 8806 CASE 8807 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 8808 THEN '' 8809 ELSE concat("INFO", ';') 8810 END, 8811 CASE 8812 WHEN dataframe_barcode."{barcode_infos}" NOT IN ('','.') 8813 AND dataframe_barcode."{barcode_infos}" NOT NULL 8814 THEN concat( 8815 '{tag}=', 8816 dataframe_barcode."{barcode_infos}" 8817 ) 8818 ELSE '' 8819 
END 8820 ) 8821 FROM dataframe_barcode 8822 WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}" 8823 """ 8824 self.conn.execute(sql_update) 8825 8826 # Remove added columns 8827 for added_column in added_columns: 8828 self.drop_column(column=added_column) 8829 8830 # Delete dataframe 8831 del dataframe_barcode 8832 gc.collect() 8833 8834 def calculation_barcode_family(self, tag: str = "BCF") -> None: 8835 """ 8836 The `calculation_barcode_family` function calculates barcode values for variants in a VCF file 8837 and updates the INFO field in the file with the calculated barcode values. 8838 8839 :param tag: The `tag` parameter in the `calculation_barcode_family` function is used to specify 8840 the barcode tag that will be added to the VCF file during the calculation process. If no value 8841 is provided for the `tag` parameter, the default value used is "BCF", defaults to BCF 8842 :type tag: str (optional) 8843 """ 8844 8845 # if FORMAT and samples 8846 if ( 8847 "FORMAT" in self.get_header_columns_as_list() 8848 and self.get_header_sample_list() 8849 ): 8850 8851 # barcode annotation field 8852 if not tag: 8853 tag = "BCF" 8854 8855 # VCF infos tags 8856 vcf_infos_tags = { 8857 tag: "barcode family calculation", 8858 f"{tag}S": "barcode family samples", 8859 } 8860 8861 # Param 8862 param = self.get_param() 8863 log.debug(f"param={param}") 8864 8865 # Prefix 8866 prefix = self.get_explode_infos_prefix() 8867 8868 # PED param 8869 ped = ( 8870 param.get("calculation", {}) 8871 .get("calculations", {}) 8872 .get("BARCODEFAMILY", {}) 8873 .get("family_pedigree", None) 8874 ) 8875 log.debug(f"ped={ped}") 8876 8877 # Load PED 8878 if ped: 8879 8880 # Pedigree is a file 8881 if isinstance(ped, str) and os.path.exists(full_path(ped)): 8882 log.debug("Pedigree is file") 8883 with open(full_path(ped)) as ped: 8884 ped = json.load(ped) 8885 8886 # Pedigree is a string 8887 elif isinstance(ped, str): 8888 log.debug("Pedigree is str") 8889 
try: 8890 ped = json.loads(ped) 8891 log.debug("Pedigree is json str") 8892 except ValueError as e: 8893 ped_samples = ped.split(",") 8894 ped = {} 8895 for ped_sample in ped_samples: 8896 ped[ped_sample] = ped_sample 8897 8898 # Pedigree is a dict 8899 elif isinstance(ped, dict): 8900 log.debug("Pedigree is dict") 8901 8902 # Pedigree is not well formatted 8903 else: 8904 msg_error = "Pedigree not well formatted" 8905 log.error(msg_error) 8906 raise ValueError(msg_error) 8907 8908 # Construct list 8909 ped_samples = list(ped.values()) 8910 8911 else: 8912 log.debug("Pedigree not defined. Take all samples") 8913 ped_samples = self.get_header_sample_list() 8914 ped = {} 8915 for ped_sample in ped_samples: 8916 ped[ped_sample] = ped_sample 8917 8918 # Check pedigree 8919 if not ped or len(ped) == 0: 8920 msg_error = f"Error in pedigree: samples {ped_samples}" 8921 log.error(msg_error) 8922 raise ValueError(msg_error) 8923 8924 # Log 8925 log.info( 8926 "Calculation 'BARCODEFAMILY' - Samples: " 8927 + ", ".join([f"{member}='{ped[member]}'" for member in ped]) 8928 ) 8929 log.debug(f"ped_samples={ped_samples}") 8930 8931 # Field 8932 barcode_infos = prefix + tag 8933 8934 # Variants table 8935 table_variants = self.get_table_variants() 8936 8937 # Header 8938 vcf_reader = self.get_header() 8939 8940 # Create variant id 8941 variant_id_column = self.get_variant_id_column() 8942 added_columns = [variant_id_column] 8943 8944 # variant_id, FORMAT and samples 8945 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 8946 ped_samples 8947 ) 8948 8949 # Create dataframe 8950 dataframe_barcode = self.get_query_to_df( 8951 f""" SELECT {samples_fields} FROM {table_variants} """ 8952 ) 8953 8954 # Create barcode column 8955 dataframe_barcode[barcode_infos] = dataframe_barcode.apply( 8956 lambda row: barcode(row, samples=ped_samples), axis=1 8957 ) 8958 8959 # Add barcode family to header 8960 # Add vaf_normalization to header 8961 vcf_reader.formats[tag] = 
vcf.parser._Format( 8962 id=tag, 8963 num=".", 8964 type="String", 8965 desc=vcf_infos_tags.get(tag, "barcode family calculation"), 8966 type_code=self.code_type_map.get("String"), 8967 ) 8968 vcf_reader.formats[f"{tag}S"] = vcf.parser._Format( 8969 id=f"{tag}S", 8970 num=".", 8971 type="String", 8972 desc=vcf_infos_tags.get(f"{tag}S", "barcode family samples"), 8973 type_code=self.code_type_map.get("String"), 8974 ) 8975 8976 # Update 8977 # for sample in ped_samples: 8978 sql_update_set = [] 8979 for sample in self.get_header_sample_list() + ["FORMAT"]: 8980 if sample in ped_samples: 8981 value = f'dataframe_barcode."{barcode_infos}"' 8982 value_samples = "'" + ",".join(ped_samples) + "'" 8983 elif sample == "FORMAT": 8984 value = f"'{tag}'" 8985 value_samples = f"'{tag}S'" 8986 else: 8987 value = "'.'" 8988 value_samples = "'.'" 8989 format_regex = r"[a-zA-Z0-9\s]" 8990 sql_update_set.append( 8991 f""" 8992 "{sample}" = 8993 concat( 8994 CASE 8995 WHEN {table_variants}."{sample}" = './.' 8996 THEN concat('./.',regexp_replace(regexp_replace({table_variants}.FORMAT, '{format_regex}', '', 'g'), ':', ':.', 'g')) 8997 ELSE {table_variants}."{sample}" 8998 END, 8999 ':', 9000 {value}, 9001 ':', 9002 {value_samples} 9003 ) 9004 """ 9005 ) 9006 9007 sql_update_set_join = ", ".join(sql_update_set) 9008 sql_update = f""" 9009 UPDATE {table_variants} 9010 SET {sql_update_set_join} 9011 FROM dataframe_barcode 9012 WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}" 9013 """ 9014 self.conn.execute(sql_update) 9015 9016 # Remove added columns 9017 for added_column in added_columns: 9018 self.drop_column(column=added_column) 9019 9020 # Delete dataframe 9021 del dataframe_barcode 9022 gc.collect() 9023 9024 def calculation_trio(self) -> None: 9025 """ 9026 The `calculation_trio` function performs trio calculations on a VCF file by adding trio 9027 information to the INFO field of each variant. 
9028 """ 9029 9030 # if FORMAT and samples 9031 if ( 9032 "FORMAT" in self.get_header_columns_as_list() 9033 and self.get_header_sample_list() 9034 ): 9035 9036 # trio annotation field 9037 trio_tag = "trio" 9038 9039 # VCF infos tags 9040 vcf_infos_tags = { 9041 "trio": "trio calculation", 9042 } 9043 9044 # Param 9045 param = self.get_param() 9046 9047 # Prefix 9048 prefix = self.get_explode_infos_prefix() 9049 9050 # Trio param 9051 trio_ped = ( 9052 param.get("calculation", {}) 9053 .get("calculations", {}) 9054 .get("TRIO", {}) 9055 .get("trio_pedigree", None) 9056 ) 9057 9058 # Load trio 9059 if trio_ped: 9060 9061 # Trio pedigree is a file 9062 if isinstance(trio_ped, str) and os.path.exists(full_path(trio_ped)): 9063 log.debug("TRIO pedigree is file") 9064 with open(full_path(trio_ped)) as trio_ped: 9065 trio_ped = json.load(trio_ped) 9066 9067 # Trio pedigree is a string 9068 elif isinstance(trio_ped, str): 9069 log.debug("TRIO pedigree is str") 9070 try: 9071 trio_ped = json.loads(trio_ped) 9072 log.debug("TRIO pedigree is json str") 9073 except ValueError as e: 9074 trio_samples = trio_ped.split(",") 9075 if len(trio_samples) == 3: 9076 trio_ped = { 9077 "father": trio_samples[0], 9078 "mother": trio_samples[1], 9079 "child": trio_samples[2], 9080 } 9081 log.debug("TRIO pedigree is list str") 9082 else: 9083 msg_error = "TRIO pedigree not well formatted" 9084 log.error(msg_error) 9085 raise ValueError(msg_error) 9086 9087 # Trio pedigree is a dict 9088 elif isinstance(trio_ped, dict): 9089 log.debug("TRIO pedigree is dict") 9090 9091 # Trio pedigree is not well formatted 9092 else: 9093 msg_error = "TRIO pedigree not well formatted" 9094 log.error(msg_error) 9095 raise ValueError(msg_error) 9096 9097 # Construct trio list 9098 trio_samples = [ 9099 trio_ped.get("father", ""), 9100 trio_ped.get("mother", ""), 9101 trio_ped.get("child", ""), 9102 ] 9103 9104 else: 9105 log.debug("TRIO pedigree not defined. 
Take the first 3 samples") 9106 samples_list = self.get_header_sample_list() 9107 if len(samples_list) >= 3: 9108 trio_samples = self.get_header_sample_list()[0:3] 9109 trio_ped = { 9110 "father": trio_samples[0], 9111 "mother": trio_samples[1], 9112 "child": trio_samples[2], 9113 } 9114 else: 9115 msg_error = f"Error in TRIO pedigree: only {len(samples_list)} samples {samples_list}" 9116 log.error(msg_error) 9117 raise ValueError(msg_error) 9118 9119 # Check trio pedigree 9120 if not trio_ped or len(trio_ped) != 3: 9121 msg_error = f"Error in TRIO pedigree: {trio_ped}" 9122 log.error(msg_error) 9123 raise ValueError(msg_error) 9124 9125 # Log 9126 log.info( 9127 f"Calculation 'TRIO' - Samples: " 9128 + ", ".join([f"{member}='{trio_ped[member]}'" for member in trio_ped]) 9129 ) 9130 9131 # Field 9132 trio_infos = prefix + trio_tag 9133 9134 # Variants table 9135 table_variants = self.get_table_variants() 9136 9137 # Header 9138 vcf_reader = self.get_header() 9139 9140 # Create variant id 9141 variant_id_column = self.get_variant_id_column() 9142 added_columns = [variant_id_column] 9143 9144 # variant_id, FORMAT and samples 9145 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 9146 self.get_header_sample_list() 9147 ) 9148 9149 # Create dataframe 9150 dataframe_trio = self.get_query_to_df( 9151 f""" SELECT {samples_fields} FROM {table_variants} """ 9152 ) 9153 9154 # Create trio column 9155 dataframe_trio[trio_infos] = dataframe_trio.apply( 9156 lambda row: trio(row, samples=trio_samples), axis=1 9157 ) 9158 9159 # Add trio to header 9160 vcf_reader.infos[trio_tag] = vcf.parser._Info( 9161 trio_tag, 9162 ".", 9163 "String", 9164 vcf_infos_tags.get(trio_tag, "snpEff hgvs annotations"), 9165 "howard calculation", 9166 "0", 9167 self.code_type_map.get("String"), 9168 ) 9169 9170 # Update 9171 sql_update = f""" 9172 UPDATE {table_variants} 9173 SET "INFO" = 9174 concat( 9175 CASE 9176 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 9177 THEN '' 9178 ELSE 
concat("INFO", ';') 9179 END, 9180 CASE 9181 WHEN dataframe_trio."{trio_infos}" NOT IN ('','.') 9182 AND dataframe_trio."{trio_infos}" NOT NULL 9183 THEN concat( 9184 '{trio_tag}=', 9185 dataframe_trio."{trio_infos}" 9186 ) 9187 ELSE '' 9188 END 9189 ) 9190 FROM dataframe_trio 9191 WHERE {table_variants}."{variant_id_column}" = dataframe_trio."{variant_id_column}" 9192 """ 9193 self.conn.execute(sql_update) 9194 9195 # Remove added columns 9196 for added_column in added_columns: 9197 self.drop_column(column=added_column) 9198 9199 # Delete dataframe 9200 del dataframe_trio 9201 gc.collect() 9202 9203 def calculation_vaf_normalization(self) -> None: 9204 """ 9205 The `calculation_vaf_normalization` function calculates the VAF (Variant Allele Frequency) 9206 normalization for each sample in a VCF file and updates the FORMAT and INFO fields accordingly. 9207 :return: The function does not return anything. 9208 """ 9209 9210 # if FORMAT and samples 9211 if ( 9212 "FORMAT" in self.get_header_columns_as_list() 9213 and self.get_header_sample_list() 9214 ): 9215 9216 # vaf_normalization annotation field 9217 vaf_normalization_tag = "VAF" 9218 9219 # VCF infos tags 9220 vcf_infos_tags = { 9221 "VAF": "VAF Variant Frequency", 9222 } 9223 9224 # Prefix 9225 prefix = self.get_explode_infos_prefix() 9226 9227 # Variants table 9228 table_variants = self.get_table_variants() 9229 9230 # Header 9231 vcf_reader = self.get_header() 9232 9233 # Do not calculate if VAF already exists 9234 if "VAF" in vcf_reader.formats: 9235 log.debug("VAF already on genotypes") 9236 return 9237 9238 # Create variant id 9239 variant_id_column = self.get_variant_id_column() 9240 added_columns = [variant_id_column] 9241 9242 # variant_id, FORMAT and samples 9243 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 9244 f""" "{sample}" """ for sample in self.get_header_sample_list() 9245 ) 9246 9247 # Create dataframe 9248 query = f""" SELECT {variant_id_column}, FORMAT, {samples_fields} 
FROM {table_variants} """ 9249 log.debug(f"query={query}") 9250 dataframe_vaf_normalization = self.get_query_to_df(query=query) 9251 9252 vaf_normalization_set = [] 9253 9254 # for each sample vaf_normalization 9255 for sample in self.get_header_sample_list(): 9256 dataframe_vaf_normalization[sample] = dataframe_vaf_normalization.apply( 9257 lambda row: vaf_normalization(row, sample=sample), axis=1 9258 ) 9259 vaf_normalization_set.append( 9260 f""" "{sample}" = dataframe_vaf_normalization."{sample}" """ 9261 ) 9262 9263 # Add VAF to FORMAT 9264 dataframe_vaf_normalization["FORMAT"] = dataframe_vaf_normalization[ 9265 "FORMAT" 9266 ].apply(lambda x: str(x) + ":VAF") 9267 vaf_normalization_set.append( 9268 f""" "FORMAT" = dataframe_vaf_normalization."FORMAT" """ 9269 ) 9270 9271 # Add vaf_normalization to header 9272 vcf_reader.formats[vaf_normalization_tag] = vcf.parser._Format( 9273 id=vaf_normalization_tag, 9274 num="1", 9275 type="Float", 9276 desc=vcf_infos_tags.get(vaf_normalization_tag, "VAF Variant Frequency"), 9277 type_code=self.code_type_map.get("Float"), 9278 ) 9279 9280 # Create fields to add in INFO 9281 sql_vaf_normalization_set = " , ".join(vaf_normalization_set) 9282 9283 # Update 9284 sql_update = f""" 9285 UPDATE {table_variants} 9286 SET {sql_vaf_normalization_set} 9287 FROM dataframe_vaf_normalization 9288 WHERE variants."{variant_id_column}" = dataframe_vaf_normalization."{variant_id_column}" 9289 9290 """ 9291 self.conn.execute(sql_update) 9292 9293 # Remove added columns 9294 for added_column in added_columns: 9295 self.drop_column(column=added_column) 9296 9297 # Delete dataframe 9298 del dataframe_vaf_normalization 9299 gc.collect() 9300 9301 def calculation_genotype_stats(self, info: str = "VAF") -> None: 9302 """ 9303 The `calculation_genotype_stats` function calculates genotype statistics for a given information 9304 field in a VCF file and updates the INFO column of the variants table with the calculated 9305 statistics. 
9306 9307 :param info: The `info` parameter is a string that represents the type of information for which 9308 genotype statistics are calculated. It is used to generate various VCF info tags for the 9309 statistics, such as the number of occurrences, the list of values, the minimum value, the 9310 maximum value, the mean, the median, defaults to VAF 9311 :type info: str (optional) 9312 """ 9313 9314 # if FORMAT and samples 9315 if ( 9316 "FORMAT" in self.get_header_columns_as_list() 9317 and self.get_header_sample_list() 9318 ): 9319 9320 # vaf_stats annotation field 9321 vaf_stats_tag = info + "_stats" 9322 9323 # VCF infos tags 9324 vcf_infos_tags = { 9325 info + "_stats_nb": f"genotype {info} Statistics - number of {info}", 9326 info + "_stats_list": f"genotype {info} Statistics - list of {info}", 9327 info + "_stats_min": f"genotype {info} Statistics - min {info}", 9328 info + "_stats_max": f"genotype {info} Statistics - max {info}", 9329 info + "_stats_mean": f"genotype {info} Statistics - mean {info}", 9330 info + "_stats_mediane": f"genotype {info} Statistics - mediane {info}", 9331 info 9332 + "_stats_stdev": f"genotype {info} Statistics - standard deviation {info}", 9333 } 9334 9335 # Prefix 9336 prefix = self.get_explode_infos_prefix() 9337 9338 # Field 9339 vaf_stats_infos = prefix + vaf_stats_tag 9340 9341 # Variants table 9342 table_variants = self.get_table_variants() 9343 9344 # Header 9345 vcf_reader = self.get_header() 9346 9347 # Create variant id 9348 variant_id_column = self.get_variant_id_column() 9349 added_columns = [variant_id_column] 9350 9351 # variant_id, FORMAT and samples 9352 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 9353 self.get_header_sample_list() 9354 ) 9355 9356 # Create dataframe 9357 dataframe_vaf_stats = self.get_query_to_df( 9358 f""" SELECT {samples_fields} FROM {table_variants} """ 9359 ) 9360 9361 # Create vaf_stats column 9362 dataframe_vaf_stats[vaf_stats_infos] = dataframe_vaf_stats.apply( 9363 
lambda row: genotype_stats( 9364 row, samples=self.get_header_sample_list(), info=info 9365 ), 9366 axis=1, 9367 ) 9368 9369 # List of vcf tags 9370 sql_vaf_stats_fields = [] 9371 9372 # Check all VAF stats infos 9373 for stat in vcf_infos_tags: 9374 9375 # Extract stats 9376 dataframe_vaf_stats[stat] = dataframe_vaf_stats[vaf_stats_infos].apply( 9377 lambda x: dict(x).get(stat, "") 9378 ) 9379 9380 # Add snpeff_hgvs to header 9381 vcf_reader.infos[stat] = vcf.parser._Info( 9382 stat, 9383 ".", 9384 "String", 9385 vcf_infos_tags.get(stat, "genotype statistics"), 9386 "howard calculation", 9387 "0", 9388 self.code_type_map.get("String"), 9389 ) 9390 9391 if len(sql_vaf_stats_fields): 9392 sep = ";" 9393 else: 9394 sep = "" 9395 9396 # Create fields to add in INFO 9397 sql_vaf_stats_fields.append( 9398 f""" 9399 CASE 9400 WHEN dataframe_vaf_stats."{stat}" NOT NULL 9401 THEN concat( 9402 '{sep}{stat}=', 9403 dataframe_vaf_stats."{stat}" 9404 ) 9405 ELSE '' 9406 END 9407 """ 9408 ) 9409 9410 # SQL set for update 9411 sql_vaf_stats_fields_set = ", ".join(sql_vaf_stats_fields) 9412 9413 # Update 9414 sql_update = f""" 9415 UPDATE {table_variants} 9416 SET "INFO" = 9417 concat( 9418 CASE 9419 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 9420 THEN '' 9421 ELSE concat("INFO", ';') 9422 END, 9423 {sql_vaf_stats_fields_set} 9424 ) 9425 FROM dataframe_vaf_stats 9426 WHERE {table_variants}."{variant_id_column}" = dataframe_vaf_stats."{variant_id_column}" 9427 9428 """ 9429 self.conn.execute(sql_update) 9430 9431 # Remove added columns 9432 for added_column in added_columns: 9433 self.drop_column(column=added_column) 9434 9435 # Delete dataframe 9436 del dataframe_vaf_stats 9437 gc.collect() 9438 9439 def calculation_transcripts_annotation( 9440 self, info_json: str = None, info_format: str = None 9441 ) -> None: 9442 """ 9443 The `calculation_transcripts_annotation` function creates a transcripts table and adds an info 9444 field to it if transcripts are available. 
9445 9446 :param info_json: The `info_json` parameter in the `calculation_transcripts_annotation` method 9447 is a string parameter that represents the information field to be used in the transcripts JSON. 9448 It is used to specify the JSON format for the transcripts information. If no value is provided 9449 when calling the method, it defaults to " 9450 :type info_json: str 9451 :param info_format: The `info_format` parameter in the `calculation_transcripts_annotation` 9452 method is a string parameter that specifies the format of the information field to be used in 9453 the transcripts JSON. It is used to define the format of the information field 9454 :type info_format: str 9455 """ 9456 9457 # Create transcripts table 9458 transcripts_table = self.create_transcript_view() 9459 9460 # Add info field 9461 if transcripts_table: 9462 self.transcript_view_to_variants( 9463 transcripts_table=transcripts_table, 9464 transcripts_info_field_json=info_json, 9465 transcripts_info_field_format=info_format, 9466 ) 9467 else: 9468 log.info("No Transcripts to process. Check param.json file configuration") 9469 9470 def calculation_transcripts_prioritization(self) -> None: 9471 """ 9472 The function `calculation_transcripts_prioritization` creates a transcripts table and 9473 prioritizes transcripts based on certain criteria. 9474 """ 9475 9476 # Create transcripts table 9477 transcripts_table = self.create_transcript_view() 9478 9479 # Add info field 9480 if transcripts_table: 9481 self.transcripts_prioritization(transcripts_table=transcripts_table) 9482 else: 9483 log.info("No Transcripts to process. 
Check param.json file configuration") 9484 9485 ############### 9486 # Transcripts # 9487 ############### 9488 9489 def transcripts_prioritization( 9490 self, transcripts_table: str = None, param: dict = {} 9491 ) -> bool: 9492 """ 9493 The `transcripts_prioritization` function prioritizes transcripts based on certain parameters 9494 and updates the variants table with the prioritized information. 9495 9496 :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name 9497 of the table containing transcripts data. If no value is provided, it defaults to "transcripts". 9498 This parameter is used to identify the table where the transcripts data is stored for the 9499 prioritization process 9500 :type transcripts_table: str 9501 :param param: The `param` parameter in the `transcripts_prioritization` method is a dictionary 9502 that contains various configuration settings for the prioritization process of transcripts. It 9503 is used to customize the behavior of the prioritization algorithm and includes settings such as 9504 the prefix for prioritization fields, default profiles, and other 9505 :type param: dict 9506 :return: The function `transcripts_prioritization` returns a boolean value `True` if the 9507 transcripts prioritization process is successfully completed, and `False` if there are any 9508 issues or if no profile is defined for transcripts prioritization. 
9509 """ 9510 9511 log.debug("Start transcripts prioritization...") 9512 9513 # Param 9514 if not param: 9515 param = self.get_param() 9516 9517 # Variants table 9518 table_variants = self.get_table_variants() 9519 log.debug(f"transcripts_table={transcripts_table}") 9520 # Transcripts table 9521 if transcripts_table is None: 9522 log.debug(f"transcripts_table={transcripts_table}") 9523 transcripts_table = self.create_transcript_view( 9524 transcripts_table="transcripts", param=param 9525 ) 9526 log.debug(f"transcripts_table={transcripts_table}") 9527 if transcripts_table is None: 9528 msg_err = "No Transcripts table availalble" 9529 log.error(msg_err) 9530 raise ValueError(msg_err) 9531 9532 # Get transcripts columns 9533 columns_as_list_query = f""" 9534 DESCRIBE {transcripts_table} 9535 """ 9536 columns_as_list = list( 9537 self.get_query_to_df(columns_as_list_query)["column_name"] 9538 ) 9539 9540 # Create INFO if not exists 9541 if "INFO" not in columns_as_list: 9542 query_add_info = f""" 9543 ALTER TABLE {transcripts_table} ADD COLUMN INFO STRING DEFAULT ''; 9544 """ 9545 self.execute_query(query_add_info) 9546 9547 # Prioritization param and Force only PZ Score and Flag 9548 pz_param = param.get("transcripts", {}).get("prioritization", {}) 9549 pz_fields_score = pz_param.get("pzprefix", "PTZ") + "Score" 9550 pz_fields_flag = pz_param.get("pzprefix", "PTZ") + "Flag" 9551 pz_fields_transcripts = pz_param.get("pzprefix", "PTZ") + "Transcript" 9552 pz_param["pzfields"] = [pz_fields_score, pz_fields_flag] 9553 pz_profile_default = ( 9554 param.get("transcripts", {}).get("prioritization", {}).get("profiles", None) 9555 ) 9556 9557 # Exit if no profile 9558 if pz_profile_default is None: 9559 log.warning("No profile defined for transcripts prioritization") 9560 return False 9561 9562 # Prioritization 9563 prioritization_result = self.prioritization( 9564 table=transcripts_table, 9565 pz_param=param.get("transcripts", {}).get("prioritization", {}), 9566 ) 9567 if not 
prioritization_result: 9568 log.warning("Transcripts prioritization not processed") 9569 return False 9570 9571 # Explode PZ fields 9572 self.explode_infos( 9573 table=transcripts_table, 9574 fields=param.get("transcripts", {}) 9575 .get("prioritization", {}) 9576 .get("pzfields", []), 9577 ) 9578 9579 # Export Transcripts prioritization infos to variants table 9580 query_update = f""" 9581 WITH RankedTranscripts AS ( 9582 SELECT 9583 "#CHROM", POS, REF, ALT, transcript, {pz_fields_score}, {pz_fields_flag}, 9584 ROW_NUMBER() OVER ( 9585 PARTITION BY "#CHROM", POS, REF, ALT 9586 ORDER BY {pz_fields_flag} ASC, {pz_fields_score} DESC, transcript ASC 9587 ) AS rn 9588 FROM 9589 {transcripts_table} 9590 ) 9591 UPDATE {table_variants} 9592 SET 9593 INFO = CONCAT(CASE 9594 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 9595 THEN '' 9596 ELSE concat("INFO", ';') 9597 END, 9598 concat('{pz_fields_transcripts}=', transcript, ';{pz_fields_score}=', {pz_fields_score}, ';{pz_fields_flag}=', {pz_fields_flag}) 9599 ) 9600 FROM 9601 RankedTranscripts 9602 WHERE 9603 rn = 1 9604 AND variants."#CHROM" = RankedTranscripts."#CHROM" 9605 AND variants."POS" = RankedTranscripts."POS" 9606 AND variants."REF" = RankedTranscripts."REF" 9607 AND variants."ALT" = RankedTranscripts."ALT" 9608 9609 """ 9610 self.execute_query(query=query_update) 9611 9612 # Add PZ Transcript in header 9613 self.get_header().infos[pz_fields_transcripts] = vcf.parser._Info( 9614 pz_fields_transcripts, 9615 ".", 9616 "String", 9617 f"Transcript selected from transcripts prioritization process, profile {pz_profile_default}", 9618 "unknown", 9619 "unknown", 9620 code_type_map["String"], 9621 ) 9622 9623 # Return 9624 return True 9625 9626 def create_transcript_view_from_columns_map( 9627 self, 9628 transcripts_table: str = "transcripts", 9629 columns_maps: dict = {}, 9630 added_columns: list = [], 9631 temporary_tables: list = None, 9632 annotation_fields: list = None, 9633 ) -> tuple[list, list, list]: 9634 """ 9635 The 
`create_transcript_view_from_columns_map` function generates a temporary table view based on 9636 specified columns mapping for transcripts data. 9637 9638 :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name of 9639 the table where the transcripts data is stored or will be stored in the database. This table 9640 typically contains information about transcripts such as Ensembl transcript IDs, gene names, scores, 9641 predictions, etc. It defaults to "transcripts, defaults to transcripts 9642 :type transcripts_table: str (optional) 9643 :param columns_maps: The `columns_maps` parameter is a dictionary that contains information about 9644 how to map columns from a transcripts table to create a view. Each entry in the `columns_maps` list 9645 represents a mapping configuration for a specific set of columns. It typically includes details such 9646 as the main transcript column and additional information columns 9647 :type columns_maps: dict 9648 :param added_columns: The `added_columns` parameter in the `create_transcript_view_from_columns_map` 9649 function is a list that stores the additional columns that will be added to the view being created 9650 based on the columns map provided. These columns are generated by exploding the transcript 9651 information columns along with the main transcript column 9652 :type added_columns: list 9653 :param temporary_tables: The `temporary_tables` parameter in the 9654 `create_transcript_view_from_columns_map` function is a list that stores the names of temporary 9655 tables created during the process of creating a transcript view from a columns map. 
These temporary 9656 tables are used to store intermediate results or transformations before the final view is generated 9657 :type temporary_tables: list 9658 :param annotation_fields: The `annotation_fields` parameter in the 9659 `create_transcript_view_from_columns_map` function is a list that stores the fields that are used 9660 for annotation in the query view creation process. These fields are extracted from the 9661 `transcripts_column` and `transcripts_infos_columns` specified in the `columns 9662 :type annotation_fields: list 9663 :return: The function `create_transcript_view_from_columns_map` returns a tuple containing three 9664 lists: `added_columns`, `temporary_tables`, and `annotation_fields`. 9665 """ 9666 9667 log.debug("Start transcrpts view creation from columns map...") 9668 9669 # "from_columns_map": [ 9670 # { 9671 # "transcripts_column": "Ensembl_transcriptid", 9672 # "transcripts_infos_columns": [ 9673 # "genename", 9674 # "Ensembl_geneid", 9675 # "LIST_S2_score", 9676 # "LIST_S2_pred", 9677 # ], 9678 # }, 9679 # { 9680 # "transcripts_column": "Ensembl_transcriptid", 9681 # "transcripts_infos_columns": [ 9682 # "genename", 9683 # "VARITY_R_score", 9684 # "Aloft_pred", 9685 # ], 9686 # }, 9687 # ], 9688 9689 # Init 9690 if temporary_tables is None: 9691 temporary_tables = [] 9692 if annotation_fields is None: 9693 annotation_fields = [] 9694 9695 # Variants table 9696 table_variants = self.get_table_variants() 9697 9698 for columns_map in columns_maps: 9699 9700 # Transcript column 9701 transcripts_column = columns_map.get("transcripts_column", None) 9702 9703 # Transcripts infos columns 9704 transcripts_infos_columns = columns_map.get("transcripts_infos_columns", []) 9705 9706 if transcripts_column is not None: 9707 9708 # Explode 9709 added_columns += self.explode_infos( 9710 fields=[transcripts_column] + transcripts_infos_columns 9711 ) 9712 9713 # View clauses 9714 clause_select = [] 9715 for field in [transcripts_column] + 
transcripts_infos_columns: 9716 clause_select.append( 9717 f""" regexp_split_to_table("{field}", ',') AS '{field}' """ 9718 ) 9719 if field not in [transcripts_column]: 9720 annotation_fields.append(field) 9721 9722 # Querey View 9723 query = f""" 9724 SELECT 9725 "#CHROM", POS, REF, ALT, INFO, 9726 "{transcripts_column}" AS 'transcript', 9727 {", ".join(clause_select)} 9728 FROM ( 9729 SELECT 9730 "#CHROM", POS, REF, ALT, INFO, 9731 {", ".join(clause_select)} 9732 FROM {table_variants} 9733 ) 9734 WHERE "{transcripts_column}" IS NOT NULL 9735 """ 9736 9737 # Create temporary table 9738 temporary_table = transcripts_table + "".join( 9739 random.choices(string.ascii_uppercase + string.digits, k=10) 9740 ) 9741 9742 # Temporary_tables 9743 temporary_tables.append(temporary_table) 9744 query_view = f""" 9745 CREATE TEMPORARY TABLE {temporary_table} 9746 AS ({query}) 9747 """ 9748 self.execute_query(query=query_view) 9749 9750 return added_columns, temporary_tables, annotation_fields 9751 9752 def create_transcript_view_from_column_format( 9753 self, 9754 transcripts_table: str = "transcripts", 9755 column_formats: dict = {}, 9756 temporary_tables: list = None, 9757 annotation_fields: list = None, 9758 ) -> tuple[list, list, list]: 9759 """ 9760 The `create_transcript_view_from_column_format` function generates a transcript view based on 9761 specified column formats, adds additional columns and annotation fields, and returns the list of 9762 temporary tables and annotation fields. 9763 9764 :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name of 9765 the table containing the transcripts data. This table will be used as the base table for creating 9766 the transcript view. 
The default value for this parameter is "transcripts", but you can provide a 9767 different table name if needed, defaults to transcripts 9768 :type transcripts_table: str (optional) 9769 :param column_formats: The `column_formats` parameter is a dictionary that contains information 9770 about the columns to be used for creating the transcript view. Each entry in the dictionary 9771 specifies the mapping between a transcripts column and a transcripts infos column. For example, in 9772 the provided code snippet: 9773 :type column_formats: dict 9774 :param temporary_tables: The `temporary_tables` parameter in the 9775 `create_transcript_view_from_column_format` function is a list that stores the names of temporary 9776 views created during the process of creating a transcript view from a column format. These temporary 9777 views are used to manipulate and extract data before generating the final transcript view. It 9778 :type temporary_tables: list 9779 :param annotation_fields: The `annotation_fields` parameter in the 9780 `create_transcript_view_from_column_format` function is a list that stores the annotation fields 9781 that are extracted from the temporary views created during the process. These annotation fields are 9782 obtained by querying the temporary views and extracting the column names excluding specific columns 9783 like `#CH 9784 :type annotation_fields: list 9785 :return: The `create_transcript_view_from_column_format` function returns two lists: 9786 `temporary_tables` and `annotation_fields`. 
9787 """ 9788 9789 log.debug("Start transcrpts view creation from column format...") 9790 9791 # "from_column_format": [ 9792 # { 9793 # "transcripts_column": "ANN", 9794 # "transcripts_infos_column": "Feature_ID", 9795 # } 9796 # ], 9797 9798 # Init 9799 if temporary_tables is None: 9800 temporary_tables = [] 9801 if annotation_fields is None: 9802 annotation_fields = [] 9803 9804 for column_format in column_formats: 9805 9806 # annotation field and transcript annotation field 9807 annotation_field = column_format.get("transcripts_column", "ANN") 9808 transcript_annotation = column_format.get( 9809 "transcripts_infos_column", "Feature_ID" 9810 ) 9811 9812 # Temporary View name 9813 temporary_view_name = transcripts_table + "".join( 9814 random.choices(string.ascii_uppercase + string.digits, k=10) 9815 ) 9816 9817 # Create temporary view name 9818 temporary_view_name = self.annotation_format_to_table( 9819 uniquify=True, 9820 annotation_field=annotation_field, 9821 view_name=temporary_view_name, 9822 annotation_id=transcript_annotation, 9823 ) 9824 9825 # Annotation fields 9826 if temporary_view_name: 9827 query_annotation_fields = f""" 9828 SELECT * 9829 FROM ( 9830 DESCRIBE SELECT * 9831 FROM {temporary_view_name} 9832 ) 9833 WHERE column_name not in ('#CHROM', 'POS', 'REF', 'ALT') 9834 """ 9835 df_annotation_fields = self.get_query_to_df( 9836 query=query_annotation_fields 9837 ) 9838 9839 # Add temporary view and annotation fields 9840 temporary_tables.append(temporary_view_name) 9841 annotation_fields += list(set(df_annotation_fields["column_name"])) 9842 9843 return temporary_tables, annotation_fields 9844 9845 def create_transcript_view( 9846 self, 9847 transcripts_table: str = None, 9848 transcripts_table_drop: bool = True, 9849 param: dict = {}, 9850 ) -> str: 9851 """ 9852 The `create_transcript_view` function generates a transcript view by processing data from a 9853 specified table based on provided parameters and structural information. 
9854 9855 :param transcripts_table: The `transcripts_table` parameter in the `create_transcript_view` function 9856 is used to specify the name of the table that will store the final transcript view data. If a table 9857 name is not provided, the function will create a new table to store the transcript view data, and by 9858 default,, defaults to transcripts 9859 :type transcripts_table: str (optional) 9860 :param transcripts_table_drop: The `transcripts_table_drop` parameter in the 9861 `create_transcript_view` function is a boolean parameter that determines whether to drop the 9862 existing transcripts table before creating a new one. If `transcripts_table_drop` is set to `True`, 9863 the function will drop the existing transcripts table if it exists, defaults to True 9864 :type transcripts_table_drop: bool (optional) 9865 :param param: The `param` parameter in the `create_transcript_view` function is a dictionary that 9866 contains information needed to create a transcript view. It includes details such as the structure 9867 of the transcripts, columns mapping, column formats, and other necessary information for generating 9868 the view. This parameter allows for flexibility and customization 9869 :type param: dict 9870 :return: The `create_transcript_view` function returns the name of the transcripts table that was 9871 created or modified during the execution of the function. 
9872 """ 9873 9874 log.debug("Start transcripts view creation...") 9875 9876 # Default 9877 transcripts_table_default = "transcripts" 9878 9879 # Param 9880 if not param: 9881 param = self.get_param() 9882 9883 # Struct 9884 struct = param.get("transcripts", {}).get("struct", None) 9885 9886 if struct: 9887 9888 # Transcripts table 9889 if transcripts_table is None: 9890 transcripts_table = param.get("transcripts", {}).get( 9891 "table", transcripts_table_default 9892 ) 9893 9894 # added_columns 9895 added_columns = [] 9896 9897 # Temporary tables 9898 temporary_tables = [] 9899 9900 # Annotation fields 9901 annotation_fields = [] 9902 9903 # from columns map 9904 columns_maps = struct.get("from_columns_map", []) 9905 added_columns_tmp, temporary_tables_tmp, annotation_fields_tmp = ( 9906 self.create_transcript_view_from_columns_map( 9907 transcripts_table=transcripts_table, 9908 columns_maps=columns_maps, 9909 added_columns=added_columns, 9910 temporary_tables=temporary_tables, 9911 annotation_fields=annotation_fields, 9912 ) 9913 ) 9914 added_columns += added_columns_tmp 9915 temporary_tables += temporary_tables_tmp 9916 annotation_fields += annotation_fields_tmp 9917 9918 # from column format 9919 column_formats = struct.get("from_column_format", []) 9920 temporary_tables_tmp, annotation_fields_tmp = ( 9921 self.create_transcript_view_from_column_format( 9922 transcripts_table=transcripts_table, 9923 column_formats=column_formats, 9924 temporary_tables=temporary_tables, 9925 annotation_fields=annotation_fields, 9926 ) 9927 ) 9928 temporary_tables += temporary_tables_tmp 9929 annotation_fields += annotation_fields_tmp 9930 9931 # Merge temporary tables query 9932 query_merge = "" 9933 for temporary_table in temporary_tables: 9934 9935 # First temporary table 9936 if not query_merge: 9937 query_merge = f""" 9938 SELECT * FROM {temporary_table} 9939 """ 9940 # other temporary table (using UNION) 9941 else: 9942 query_merge += f""" 9943 UNION BY NAME SELECT * FROM 
{temporary_table} 9944 """ 9945 9946 # Merge on transcript 9947 query_merge_on_transcripts_annotation_fields = [] 9948 # Aggregate all annotations fields 9949 for annotation_field in set(annotation_fields): 9950 query_merge_on_transcripts_annotation_fields.append( 9951 f""" list_aggregate(list_distinct(array_agg({annotation_field})), 'string_agg', ',') AS {annotation_field} """ 9952 ) 9953 # Query for transcripts view 9954 query_merge_on_transcripts = f""" 9955 SELECT "#CHROM", POS, REF, ALT, INFO, transcript, {", ".join(query_merge_on_transcripts_annotation_fields)} 9956 FROM ({query_merge}) 9957 GROUP BY "#CHROM", POS, REF, ALT, INFO, transcript 9958 """ 9959 9960 # Drop transcript view is necessary 9961 if transcripts_table_drop: 9962 query_drop = f""" 9963 DROP TABLE IF EXISTS {transcripts_table}; 9964 """ 9965 self.execute_query(query=query_drop) 9966 9967 # Merge and create transcript view 9968 query_create_view = f""" 9969 CREATE TABLE IF NOT EXISTS {transcripts_table} 9970 AS {query_merge_on_transcripts} 9971 """ 9972 self.execute_query(query=query_create_view) 9973 9974 # Remove added columns 9975 for added_column in added_columns: 9976 self.drop_column(column=added_column) 9977 9978 else: 9979 9980 transcripts_table = None 9981 9982 return transcripts_table 9983 9984 def annotation_format_to_table( 9985 self, 9986 uniquify: bool = True, 9987 annotation_field: str = "ANN", 9988 annotation_id: str = "Feature_ID", 9989 view_name: str = "transcripts", 9990 ) -> str: 9991 """ 9992 The function `annotation_format_to_table` converts annotation data from a VCF file into a structured 9993 table format. 9994 9995 :param uniquify: The `uniquify` parameter is a boolean flag that determines whether to ensure unique 9996 values in the output or not. 
If set to `True`, the function will make sure that the output values 9997 are unique, defaults to True 9998 :type uniquify: bool (optional) 9999 :param annotation_field: The `annotation_field` parameter refers to the field in the VCF file that 10000 contains the annotation information for each variant. This field is used to extract the annotation 10001 details for further processing in the function, defaults to ANN 10002 :type annotation_field: str (optional) 10003 :param annotation_id: The `annotation_id` parameter in the `annotation_format_to_table` method is 10004 used to specify the identifier for the annotation feature. This identifier will be used as a column 10005 name in the resulting table or view that is created based on the annotation data. It helps in 10006 uniquely identifying each annotation entry in the, defaults to Feature_ID 10007 :type annotation_id: str (optional) 10008 :param view_name: The `view_name` parameter in the `annotation_format_to_table` method is used to 10009 specify the name of the temporary table that will be created to store the transformed annotation 10010 data. This table will hold the extracted information from the annotation field in a structured 10011 format for further processing or analysis, defaults to transcripts 10012 :type view_name: str (optional) 10013 :return: The function `annotation_format_to_table` is returning the name of the view created, which 10014 is stored in the variable `view_name`. 
10015 """ 10016 10017 # Annotation field 10018 annotation_format = "annotation_explode" 10019 10020 # Transcript annotation 10021 annotation_id = "".join(char for char in annotation_id if char.isalnum()) 10022 10023 # Prefix 10024 prefix = self.get_explode_infos_prefix() 10025 if prefix: 10026 prefix = "INFO/" 10027 10028 # Annotation fields 10029 annotation_infos = prefix + annotation_field 10030 annotation_format_infos = prefix + annotation_format 10031 10032 # Variants table 10033 table_variants = self.get_table_variants() 10034 10035 # Header 10036 vcf_reader = self.get_header() 10037 10038 # Add columns 10039 added_columns = [] 10040 10041 # Explode HGVS field in column 10042 added_columns += self.explode_infos(fields=[annotation_field]) 10043 10044 if annotation_field in vcf_reader.infos: 10045 10046 # Extract ANN header 10047 ann_description = vcf_reader.infos[annotation_field].desc 10048 pattern = r"'(.+?)'" 10049 match = re.search(pattern, ann_description) 10050 if match: 10051 ann_header_match = match.group(1).split(" | ") 10052 ann_header = [] 10053 ann_header_desc = {} 10054 for i in range(len(ann_header_match)): 10055 ann_header_info = "".join( 10056 char for char in ann_header_match[i] if char.isalnum() 10057 ) 10058 ann_header.append(ann_header_info) 10059 ann_header_desc[ann_header_info] = ann_header_match[i] 10060 if not ann_header_desc: 10061 raise ValueError("Invalid header description format") 10062 else: 10063 raise ValueError("Invalid header description format") 10064 10065 # Create variant id 10066 variant_id_column = self.get_variant_id_column() 10067 added_columns += [variant_id_column] 10068 10069 # Create dataframe 10070 dataframe_annotation_format = self.get_query_to_df( 10071 f""" SELECT "#CHROM", POS, REF, ALT, INFO, "{variant_id_column}", "{annotation_infos}" FROM {table_variants} """ 10072 ) 10073 10074 # Create annotation columns 10075 dataframe_annotation_format[ 10076 annotation_format_infos 10077 ] = 
dataframe_annotation_format[annotation_infos].apply( 10078 lambda x: explode_annotation_format( 10079 annotation=str(x), 10080 uniquify=uniquify, 10081 output_format="JSON", 10082 prefix="", 10083 header=list(ann_header_desc.values()), 10084 ) 10085 ) 10086 10087 # Find keys 10088 query_json = f"""SELECT distinct(unnest(json_keys({annotation_format}, '$.0'))) AS 'key' FROM dataframe_annotation_format;""" 10089 df_keys = self.get_query_to_df(query=query_json) 10090 10091 # Check keys 10092 query_json_key = [] 10093 for _, row in df_keys.iterrows(): 10094 10095 # Key 10096 key = row.iloc[0] 10097 10098 # key_clean 10099 key_clean = "".join(char for char in key if char.isalnum()) 10100 10101 # Type 10102 query_json_type = f"""SELECT unnest(json_extract_string({annotation_format}, '$.*."{key}"')) AS '{key_clean}' FROM dataframe_annotation_format WHERE trim('{key}') NOT IN ('');""" 10103 10104 # Get DataFrame from query 10105 df_json_type = self.get_query_to_df(query=query_json_type) 10106 10107 # Fill missing values with empty strings and then replace empty strings or None with NaN and drop rows with NaN 10108 with pd.option_context("future.no_silent_downcasting", True): 10109 df_json_type.fillna(value="", inplace=True) 10110 replace_dict = {None: np.nan, "": np.nan} 10111 df_json_type.replace(replace_dict, inplace=True) 10112 df_json_type.dropna(inplace=True) 10113 10114 # Detect column type 10115 column_type = detect_column_type(df_json_type[key_clean]) 10116 10117 # Append 10118 query_json_key.append( 10119 f"""NULLIF(unnest(json_extract_string({annotation_format}, '$.*."{key}"')), '')::{column_type} AS '{prefix}{key_clean}' """ 10120 ) 10121 10122 # Create view 10123 query_view = f""" 10124 CREATE TEMPORARY TABLE {view_name} 10125 AS ( 10126 SELECT *, {annotation_id} AS 'transcript' 10127 FROM ( 10128 SELECT "#CHROM", POS, REF, ALT, INFO, {",".join(query_json_key)} 10129 FROM dataframe_annotation_format 10130 ) 10131 ); 10132 """ 10133 
self.execute_query(query=query_view) 10134 10135 else: 10136 10137 # Return None 10138 view_name = None 10139 10140 # Remove added columns 10141 for added_column in added_columns: 10142 self.drop_column(column=added_column) 10143 10144 return view_name 10145 10146 def transcript_view_to_variants( 10147 self, 10148 transcripts_table: str = None, 10149 transcripts_column_id: str = None, 10150 transcripts_info_json: str = None, 10151 transcripts_info_field_json: str = None, 10152 transcripts_info_format: str = None, 10153 transcripts_info_field_format: str = None, 10154 param: dict = {}, 10155 ) -> bool: 10156 """ 10157 The `transcript_view_to_variants` function updates a variants table with information from 10158 transcripts in JSON format. 10159 10160 :param transcripts_table: The `transcripts_table` parameter is used to specify the name of the 10161 table containing the transcripts data. If this parameter is not provided, the function will 10162 attempt to retrieve it from the `param` dictionary or use a default value of "transcripts" 10163 :type transcripts_table: str 10164 :param transcripts_column_id: The `transcripts_column_id` parameter is used to specify the 10165 column in the `transcripts_table` that contains the unique identifier for each transcript. This 10166 identifier is used to match transcripts with variants in the database 10167 :type transcripts_column_id: str 10168 :param transcripts_info_json: The `transcripts_info_json` parameter is used to specify the name 10169 of the column in the variants table where the transcripts information will be stored in JSON 10170 format. 
This parameter allows you to define the column in the variants table that will hold the 10171 JSON-formatted information about transcripts 10172 :type transcripts_info_json: str 10173 :param transcripts_info_field_json: The `transcripts_info_field_json` parameter is used to 10174 specify the field in the VCF header that will contain information about transcripts in JSON 10175 format. This field will be added to the VCF header as an INFO field with the specified name 10176 :type transcripts_info_field_json: str 10177 :param transcripts_info_format: The `transcripts_info_format` parameter is used to specify the 10178 format of the information about transcripts that will be stored in the variants table. This 10179 format can be used to define how the transcript information will be structured or displayed 10180 within the variants table 10181 :type transcripts_info_format: str 10182 :param transcripts_info_field_format: The `transcripts_info_field_format` parameter is used to 10183 specify the field in the VCF header that will contain information about transcripts in a 10184 specific format. This field will be added to the VCF header as an INFO field with the specified 10185 name 10186 :type transcripts_info_field_format: str 10187 :param param: The `param` parameter in the `transcript_view_to_variants` method is a dictionary 10188 that contains various configuration settings related to transcripts. It is used to provide 10189 default values for certain parameters if they are not explicitly provided when calling the 10190 method. The `param` dictionary can be passed as an argument 10191 :type param: dict 10192 :return: The function `transcript_view_to_variants` returns a boolean value. It returns `True` 10193 if the operation is successful and `False` if certain conditions are not met. 
10194 """ 10195 10196 msg_info_prefix = "Start transcripts view to variants annotations" 10197 10198 log.debug(f"{msg_info_prefix}...") 10199 10200 # Default 10201 transcripts_table_default = "transcripts" 10202 transcripts_column_id_default = "transcript" 10203 transcripts_info_json_default = None 10204 transcripts_info_format_default = None 10205 transcripts_info_field_json_default = None 10206 transcripts_info_field_format_default = None 10207 10208 # Param 10209 if not param: 10210 param = self.get_param() 10211 10212 # Transcripts table 10213 if transcripts_table is None: 10214 transcripts_table = param.get("transcripts", {}).get( 10215 "table", transcripts_table_default 10216 ) 10217 10218 # Transcripts column ID 10219 if transcripts_column_id is None: 10220 transcripts_column_id = param.get("transcripts", {}).get( 10221 "column_id", transcripts_column_id_default 10222 ) 10223 10224 # Transcripts info json 10225 if transcripts_info_json is None: 10226 transcripts_info_json = param.get("transcripts", {}).get( 10227 "transcripts_info_json", transcripts_info_json_default 10228 ) 10229 10230 # Transcripts info field JSON 10231 if transcripts_info_field_json is None: 10232 transcripts_info_field_json = param.get("transcripts", {}).get( 10233 "transcripts_info_field_json", transcripts_info_field_json_default 10234 ) 10235 # if transcripts_info_field_json is not None and transcripts_info_json is None: 10236 # transcripts_info_json = transcripts_info_field_json 10237 10238 # Transcripts info format 10239 if transcripts_info_format is None: 10240 transcripts_info_format = param.get("transcripts", {}).get( 10241 "transcripts_info_format", transcripts_info_format_default 10242 ) 10243 10244 # Transcripts info field FORMAT 10245 if transcripts_info_field_format is None: 10246 transcripts_info_field_format = param.get("transcripts", {}).get( 10247 "transcripts_info_field_format", transcripts_info_field_format_default 10248 ) 10249 # if ( 10250 # 
transcripts_info_field_format is not None 10251 # and transcripts_info_format is None 10252 # ): 10253 # transcripts_info_format = transcripts_info_field_format 10254 10255 # Variants table 10256 table_variants = self.get_table_variants() 10257 10258 # Check info columns param 10259 if ( 10260 transcripts_info_json is None 10261 and transcripts_info_field_json is None 10262 and transcripts_info_format is None 10263 and transcripts_info_field_format is None 10264 ): 10265 return False 10266 10267 # Transcripts infos columns 10268 query_transcripts_infos_columns = f""" 10269 SELECT * 10270 FROM ( 10271 DESCRIBE SELECT * FROM {transcripts_table} 10272 ) 10273 WHERE "column_name" NOT IN ('#CHROM', 'POS', 'REF', 'ALT', '{transcripts_column_id}') 10274 """ 10275 transcripts_infos_columns = list( 10276 self.get_query_to_df(query=query_transcripts_infos_columns)["column_name"] 10277 ) 10278 10279 # View results 10280 clause_select = [] 10281 clause_to_json = [] 10282 clause_to_format = [] 10283 for field in transcripts_infos_columns: 10284 clause_select.append( 10285 f""" regexp_split_to_table("{field}", ',') AS '{field}' """ 10286 ) 10287 clause_to_json.append(f""" '{field}': "{field}" """) 10288 clause_to_format.append(f""" "{field}" """) 10289 10290 # Update 10291 update_set_json = [] 10292 update_set_format = [] 10293 10294 # VCF header 10295 vcf_reader = self.get_header() 10296 10297 # Transcripts to info column in JSON 10298 if transcripts_info_json is not None: 10299 10300 # Create column on variants table 10301 self.add_column( 10302 table_name=table_variants, 10303 column_name=transcripts_info_json, 10304 column_type="JSON", 10305 default_value=None, 10306 drop=False, 10307 ) 10308 10309 # Add header 10310 vcf_reader.infos[transcripts_info_json] = vcf.parser._Info( 10311 transcripts_info_json, 10312 ".", 10313 "String", 10314 "Transcripts in JSON format", 10315 "unknwon", 10316 "unknwon", 10317 self.code_type_map["String"], 10318 ) 10319 10320 # Add to update 
10321 update_set_json.append( 10322 f""" {transcripts_info_json}=t.{transcripts_info_json} """ 10323 ) 10324 10325 # Transcripts to info field in JSON 10326 if transcripts_info_field_json is not None: 10327 10328 log.debug(f"{msg_info_prefix} - Annotation in JSON format...") 10329 10330 # Add to update 10331 update_set_json.append( 10332 f""" 10333 INFO = concat( 10334 CASE 10335 WHEN INFO NOT IN ('', '.') 10336 THEN INFO 10337 ELSE '' 10338 END, 10339 CASE 10340 WHEN CAST(t.{transcripts_info_json} AS VARCHAR) NOT IN ('', '.') 10341 THEN concat( 10342 ';{transcripts_info_field_json}=', 10343 t.{transcripts_info_json} 10344 ) 10345 ELSE '' 10346 END 10347 ) 10348 """ 10349 ) 10350 10351 # Add header 10352 vcf_reader.infos[transcripts_info_field_json] = vcf.parser._Info( 10353 transcripts_info_field_json, 10354 ".", 10355 "String", 10356 "Transcripts in JSON format", 10357 "unknwon", 10358 "unknwon", 10359 self.code_type_map["String"], 10360 ) 10361 10362 if update_set_json: 10363 10364 # Update query 10365 query_update = f""" 10366 UPDATE {table_variants} 10367 SET {", ".join(update_set_json)} 10368 FROM 10369 ( 10370 SELECT 10371 "#CHROM", POS, REF, ALT, 10372 concat( 10373 '{{', 10374 string_agg( 10375 '"' || "{transcripts_column_id}" || '":' || 10376 to_json(json_output) 10377 ), 10378 '}}' 10379 )::JSON AS {transcripts_info_json} 10380 FROM 10381 ( 10382 SELECT 10383 "#CHROM", POS, REF, ALT, 10384 "{transcripts_column_id}", 10385 to_json( 10386 {{{",".join(clause_to_json)}}} 10387 )::JSON AS json_output 10388 FROM 10389 (SELECT "#CHROM", POS, REF, ALT, "{transcripts_column_id}", {", ".join(clause_select)} FROM {transcripts_table}) 10390 WHERE "{transcripts_column_id}" IS NOT NULL 10391 ) 10392 GROUP BY "#CHROM", POS, REF, ALT 10393 ) AS t 10394 WHERE {table_variants}."#CHROM" = t."#CHROM" 10395 AND {table_variants}."POS" = t."POS" 10396 AND {table_variants}."REF" = t."REF" 10397 AND {table_variants}."ALT" = t."ALT" 10398 """ 10399 10400 
self.execute_query(query=query_update) 10401 10402 # Transcripts to info column in FORMAT 10403 if transcripts_info_format is not None: 10404 10405 # Create column on variants table 10406 self.add_column( 10407 table_name=table_variants, 10408 column_name=transcripts_info_format, 10409 column_type="VARCHAR", 10410 default_value=None, 10411 drop=False, 10412 ) 10413 10414 # Add header 10415 vcf_reader.infos[transcripts_info_format] = vcf.parser._Info( 10416 transcripts_info_format, 10417 ".", 10418 "String", 10419 f"Transcripts annotations: 'transcript | {' | '.join(transcripts_infos_columns)}'", 10420 "unknwon", 10421 "unknwon", 10422 self.code_type_map["String"], 10423 ) 10424 10425 # Add to update 10426 update_set_format.append( 10427 f""" {transcripts_info_format}=t.{transcripts_info_format} """ 10428 ) 10429 10430 # Transcripts to info field in JSON 10431 if transcripts_info_field_format is not None: 10432 10433 log.debug(f"{msg_info_prefix} - Annotation in structured format...") 10434 10435 # Add to update 10436 update_set_format.append( 10437 f""" 10438 INFO = concat( 10439 CASE 10440 WHEN INFO NOT IN ('', '.') 10441 THEN INFO 10442 ELSE '' 10443 END, 10444 CASE 10445 WHEN CAST(t.{transcripts_info_format} AS VARCHAR) NOT IN ('', '.') 10446 THEN concat( 10447 ';{transcripts_info_field_format}=', 10448 t.{transcripts_info_format} 10449 ) 10450 ELSE '' 10451 END 10452 ) 10453 """ 10454 ) 10455 10456 # Add header 10457 vcf_reader.infos[transcripts_info_field_format] = vcf.parser._Info( 10458 transcripts_info_field_format, 10459 ".", 10460 "String", 10461 f"Transcripts annotations: 'transcript | {' | '.join(transcripts_infos_columns)}'", 10462 "unknwon", 10463 "unknwon", 10464 self.code_type_map["String"], 10465 ) 10466 10467 if update_set_format: 10468 10469 # Update query 10470 query_update = f""" 10471 UPDATE {table_variants} 10472 SET {", ".join(update_set_format)} 10473 FROM 10474 ( 10475 SELECT 10476 "#CHROM", POS, REF, ALT, 10477 
string_agg({transcripts_info_format}) AS {transcripts_info_format} 10478 FROM 10479 ( 10480 SELECT 10481 "#CHROM", POS, REF, ALT, 10482 "{transcripts_column_id}", 10483 concat( 10484 "{transcripts_column_id}", 10485 '|', 10486 {", '|', ".join(clause_to_format)} 10487 ) AS {transcripts_info_format} 10488 FROM 10489 (SELECT "#CHROM", POS, REF, ALT, "{transcripts_column_id}", {", ".join(clause_select)} FROM {transcripts_table}) 10490 ) 10491 GROUP BY "#CHROM", POS, REF, ALT 10492 ) AS t 10493 WHERE {table_variants}."#CHROM" = t."#CHROM" 10494 AND {table_variants}."POS" = t."POS" 10495 AND {table_variants}."REF" = t."REF" 10496 AND {table_variants}."ALT" = t."ALT" 10497 """ 10498 10499 self.execute_query(query=query_update) 10500 10501 return True
34class Variants: 35 36 def __init__( 37 self, 38 conn=None, 39 input: str = None, 40 output: str = None, 41 config: dict = {}, 42 param: dict = {}, 43 load: bool = False, 44 ) -> None: 45 """ 46 The function `__init__` initializes the variables, sets the input, output, config, param, connexion and 47 header 48 49 :param conn: the connection to the database 50 :param input: the input file 51 :param output: the output file 52 :param config: a dictionary containing the configuration of the model 53 :param param: a dictionary containing the parameters of the model 54 """ 55 56 # Init variables 57 self.init_variables() 58 59 # Input 60 self.set_input(input) 61 62 # Config 63 self.set_config(config) 64 65 # Param 66 self.set_param(param) 67 68 # Output 69 self.set_output(output) 70 71 # connexion 72 self.set_connexion(conn) 73 74 # Header 75 self.set_header() 76 77 # Samples 78 self.set_samples() 79 80 # Load data 81 if load: 82 self.load_data() 83 84 def set_samples(self, samples: list = None) -> list: 85 """ 86 The function `set_samples` sets the samples attribute of an object to a provided list or 87 retrieves it from a parameter dictionary. 88 89 :param samples: The `set_samples` method is a method of a class that takes a list of samples as 90 input and sets the `samples` attribute of the class to the provided list. If no samples are 91 provided, it tries to get the samples from the class's parameters using the `get_param` method 92 :type samples: list 93 :return: The `samples` list is being returned. 94 """ 95 96 if not samples: 97 samples = self.get_param().get("samples", {}).get("list", None) 98 99 self.samples = samples 100 101 return samples 102 103 def get_samples(self) -> list: 104 """ 105 This function returns a list of samples. 106 :return: The `get_samples` method is returning the `samples` attribute of the object. 
107 """ 108 109 return self.samples 110 111 def get_samples_check(self) -> bool: 112 """ 113 This function returns the value of the "check" key within the "samples" dictionary retrieved 114 from the parameters. 115 :return: The method `get_samples_check` is returning the value of the key "check" inside the 116 "samples" dictionary, which is nested inside the dictionary returned by the `get_param()` 117 method. If the key "check" is not found, it will return `False`. 118 """ 119 120 return self.get_param().get("samples", {}).get("check", True) 121 122 def set_input(self, input: str = None) -> None: 123 """ 124 The function `set_input` takes a file name as input, extracts the name and extension, and sets 125 attributes in the class accordingly. 126 127 :param input: The `set_input` method in the provided code snippet is used to set attributes 128 related to the input file. Here's a breakdown of the parameters and their usage in the method: 129 :type input: str 130 """ 131 132 if input and not isinstance(input, str): 133 try: 134 self.input = input.name 135 except: 136 log.error(f"Input file '{input} in bad format") 137 raise ValueError(f"Input file '{input} in bad format") 138 else: 139 self.input = input 140 141 # Input format 142 if input: 143 input_name, input_extension = os.path.splitext(self.input) 144 self.input_name = input_name 145 self.input_extension = input_extension 146 self.input_format = self.input_extension.replace(".", "") 147 148 def set_config(self, config: dict) -> None: 149 """ 150 The set_config function takes a config object and assigns it as the configuration object for the 151 class. 152 153 :param config: The `config` parameter in the `set_config` function is a dictionary object that 154 contains configuration settings for the class. 
When you call the `set_config` function with a 155 dictionary object as the argument, it will set that dictionary as the configuration object for 156 the class 157 :type config: dict 158 """ 159 160 self.config = config 161 162 def set_param(self, param: dict) -> None: 163 """ 164 This function sets a parameter object for the class based on the input dictionary. 165 166 :param param: The `set_param` method you provided takes a dictionary object as input and sets it 167 as the `param` attribute of the class instance 168 :type param: dict 169 """ 170 171 self.param = param 172 173 def init_variables(self) -> None: 174 """ 175 This function initializes the variables that will be used in the rest of the class 176 """ 177 178 self.prefix = "howard" 179 self.table_variants = "variants" 180 self.dataframe = None 181 182 self.comparison_map = { 183 "gt": ">", 184 "gte": ">=", 185 "lt": "<", 186 "lte": "<=", 187 "equals": "=", 188 "contains": "SIMILAR TO", 189 } 190 191 self.code_type_map = {"Integer": 0, "String": 1, "Float": 2, "Flag": 3} 192 193 self.code_type_map_to_sql = { 194 "Integer": "INTEGER", 195 "String": "VARCHAR", 196 "Float": "FLOAT", 197 "Flag": "VARCHAR", 198 } 199 200 self.index_additionnal_fields = [] 201 202 def get_indexing(self) -> bool: 203 """ 204 It returns the value of the key "indexing" in the dictionary. If the key is not present, it 205 returns False. 206 :return: The value of the indexing parameter. 207 """ 208 209 return self.get_param().get("indexing", False) 210 211 def get_connexion_config(self) -> dict: 212 """ 213 The function `get_connexion_config` returns a dictionary containing the configuration for a 214 connection, including the number of threads and memory limit. 215 :return: a dictionary containing the configuration for the Connexion library. 
216 """ 217 218 # config 219 config = self.get_config() 220 221 # Connexion config 222 connexion_config = {} 223 threads = self.get_threads() 224 225 # Threads 226 if threads: 227 connexion_config["threads"] = threads 228 229 # Memory 230 # if config.get("memory", None): 231 # connexion_config["memory_limit"] = config.get("memory") 232 if self.get_memory(): 233 connexion_config["memory_limit"] = self.get_memory() 234 235 # Temporary directory 236 if config.get("tmp", None): 237 connexion_config["temp_directory"] = config.get("tmp") 238 239 # Access 240 if config.get("access", None): 241 access = config.get("access") 242 if access in ["RO"]: 243 access = "READ_ONLY" 244 elif access in ["RW"]: 245 access = "READ_WRITE" 246 connexion_db = self.get_connexion_db() 247 if connexion_db in ":memory:": 248 access = "READ_WRITE" 249 connexion_config["access_mode"] = access 250 251 return connexion_config 252 253 def get_duckdb_settings(self) -> dict: 254 """ 255 The function `get_duckdb_settings` retrieves DuckDB settings from a configuration file or a 256 string. 257 :return: The function `get_duckdb_settings` returns a dictionary object `duckdb_settings_dict`. 258 """ 259 260 # config 261 config = self.get_config() 262 263 # duckdb settings 264 duckdb_settings_dict = {} 265 if config.get("duckdb_settings", None): 266 duckdb_settings = config.get("duckdb_settings") 267 duckdb_settings = full_path(duckdb_settings) 268 # duckdb setting is a file 269 if os.path.exists(duckdb_settings): 270 with open(duckdb_settings) as json_file: 271 duckdb_settings_dict = yaml.safe_load(json_file) 272 # duckdb settings is a string 273 else: 274 duckdb_settings_dict = json.loads(duckdb_settings) 275 276 return duckdb_settings_dict 277 278 def set_connexion_db(self) -> str: 279 """ 280 The function `set_connexion_db` returns the appropriate database connection string based on the 281 input format and connection type. 282 :return: the value of the variable `connexion_db`. 
283 """ 284 285 # Default connexion db 286 default_connexion_db = ":memory:" 287 288 # Find connexion db 289 if self.get_input_format() in ["db", "duckdb"]: 290 connexion_db = self.get_input() 291 elif self.get_connexion_type() in ["memory", default_connexion_db, None]: 292 connexion_db = default_connexion_db 293 elif self.get_connexion_type() in ["tmpfile"]: 294 tmp_name = tempfile.mkdtemp( 295 prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".db" 296 ) 297 connexion_db = f"{tmp_name}/tmp.db" 298 elif self.get_connexion_type() != "": 299 connexion_db = self.get_connexion_type() 300 else: 301 connexion_db = default_connexion_db 302 303 # Set connexion db 304 self.connexion_db = connexion_db 305 306 return connexion_db 307 308 def set_connexion(self, conn) -> None: 309 """ 310 The function `set_connexion` creates a connection to a database, with options for different 311 database formats and settings. 312 313 :param conn: The `conn` parameter in the `set_connexion` method is the connection to the 314 database. If a connection is not provided, a new connection to an in-memory database is created. 
315 The method then proceeds to set up the connection based on the specified format (e.g., duckdb or 316 sqlite 317 """ 318 319 # Connexion db 320 connexion_db = self.set_connexion_db() 321 322 # Connexion config 323 connexion_config = self.get_connexion_config() 324 325 # Connexion format 326 connexion_format = self.get_config().get("connexion_format", "duckdb") 327 # Set connexion format 328 self.connexion_format = connexion_format 329 330 # Connexion 331 if not conn: 332 if connexion_format in ["duckdb"]: 333 conn = duckdb.connect(connexion_db, config=connexion_config) 334 # duckDB settings 335 duckdb_settings = self.get_duckdb_settings() 336 if duckdb_settings: 337 for setting in duckdb_settings: 338 setting_value = duckdb_settings.get(setting) 339 if isinstance(setting_value, str): 340 setting_value = f"'{setting_value}'" 341 conn.execute(f"PRAGMA {setting}={setting_value};") 342 elif connexion_format in ["sqlite"]: 343 conn = sqlite3.connect(connexion_db) 344 345 # Set connexion 346 self.conn = conn 347 348 # Log 349 log.debug(f"connexion_format: {connexion_format}") 350 log.debug(f"connexion_db: {connexion_db}") 351 log.debug(f"connexion config: {connexion_config}") 352 log.debug(f"connexion duckdb settings: {self.get_duckdb_settings()}") 353 354 def set_output(self, output: str = None) -> None: 355 """ 356 The `set_output` function in Python sets the output file based on the input or a specified key 357 in the config file, extracting the output name, extension, and format. 358 359 :param output: The `output` parameter in the `set_output` method is used to specify the name of 360 the output file. If the config file has an 'output' key, the method sets the output to the value 361 of that key. 
If no output is provided, it sets the output to `None` 362 :type output: str 363 """ 364 365 if output and not isinstance(output, str): 366 self.output = output.name 367 else: 368 self.output = output 369 370 # Output format 371 if self.output: 372 output_name, output_extension = os.path.splitext(self.output) 373 self.output_name = output_name 374 self.output_extension = output_extension 375 self.output_format = self.output_extension.replace(".", "") 376 else: 377 self.output_name = None 378 self.output_extension = None 379 self.output_format = None 380 381 def set_header(self) -> None: 382 """ 383 It reads the header of a VCF file and stores it as a list of strings and as a VCF object 384 """ 385 386 input_file = self.get_input() 387 default_header_list = [ 388 "##fileformat=VCFv4.2", 389 "#CHROM POS ID REF ALT QUAL FILTER INFO", 390 ] 391 392 # Full path 393 input_file = full_path(input_file) 394 395 if input_file: 396 397 input_format = self.get_input_format() 398 input_compressed = self.get_input_compressed() 399 config = self.get_config() 400 header_list = default_header_list 401 if input_format in [ 402 "vcf", 403 "hdr", 404 "tsv", 405 "csv", 406 "psv", 407 "parquet", 408 "db", 409 "duckdb", 410 ]: 411 # header provided in param 412 if config.get("header_file", None): 413 with open(config.get("header_file"), "rt") as f: 414 header_list = self.read_vcf_header(f) 415 # within a vcf file format (header within input file itsself) 416 elif input_format in ["vcf", "hdr"] and not os.path.isdir(input_file): 417 # within a compressed vcf file format (.vcf.gz) 418 if input_compressed: 419 with bgzf.open(input_file, "rt") as f: 420 header_list = self.read_vcf_header(f) 421 # within an uncompressed vcf file format (.vcf) 422 else: 423 with open(input_file, "rt") as f: 424 header_list = self.read_vcf_header(f) 425 # header provided in default external file .hdr 426 elif os.path.exists((input_file + ".hdr")): 427 with open(input_file + ".hdr", "rt") as f: 428 header_list = 
self.read_vcf_header(f) 429 else: 430 try: # Try to get header info fields and file columns 431 432 with tempfile.TemporaryDirectory() as tmpdir: 433 434 # Create database 435 db_for_header = Database(database=input_file) 436 437 # Get header columns for infos fields 438 db_header_from_columns = ( 439 db_for_header.get_header_from_columns() 440 ) 441 442 # Get real columns in the file 443 db_header_columns = db_for_header.get_columns() 444 445 # Write header file 446 header_file_tmp = os.path.join(tmpdir, "header") 447 f = open(header_file_tmp, "w") 448 vcf.Writer(f, db_header_from_columns) 449 f.close() 450 451 # Replace #CHROM line with rel columns 452 header_list = db_for_header.read_header_file( 453 header_file=header_file_tmp 454 ) 455 header_list[-1] = "\t".join(db_header_columns) 456 457 except: 458 459 log.warning( 460 f"No header for file {input_file}. Set as default VCF header" 461 ) 462 header_list = default_header_list 463 464 else: # try for unknown format ? 465 466 log.error(f"Input file format '{input_format}' not available") 467 raise ValueError(f"Input file format '{input_format}' not available") 468 469 if not header_list: 470 header_list = default_header_list 471 472 # header as list 473 self.header_list = header_list 474 475 # header as VCF object 476 self.header_vcf = vcf.Reader(io.StringIO("\n".join(header_list))) 477 478 else: 479 480 self.header_list = None 481 self.header_vcf = None 482 483 def get_query_to_df(self, query: str = "", limit: int = None) -> pd.DataFrame: 484 """ 485 The `get_query_to_df` function takes a query as a string and returns the result as a pandas 486 DataFrame based on the connection format. 487 488 :param query: The `query` parameter in the `get_query_to_df` function is a string that 489 represents the SQL query you want to execute. 
This query will be used to fetch data from a 490 database and convert it into a pandas DataFrame 491 :type query: str 492 :param limit: The `limit` parameter in the `get_query_to_df` function is used to specify the 493 maximum number of rows to be returned in the resulting dataframe. If a limit is provided, the 494 function will only fetch up to that number of rows from the database query result. If no limit 495 is specified, 496 :type limit: int 497 :return: A pandas DataFrame is being returned by the `get_query_to_df` function. 498 """ 499 500 # Connexion format 501 connexion_format = self.get_connexion_format() 502 503 # Limit in query 504 if limit: 505 pd.set_option("display.max_rows", limit) 506 if connexion_format in ["duckdb"]: 507 df = ( 508 self.conn.execute(query) 509 .fetch_record_batch(limit) 510 .read_next_batch() 511 .to_pandas() 512 ) 513 elif connexion_format in ["sqlite"]: 514 df = next(pd.read_sql_query(query, self.conn, chunksize=limit)) 515 516 # Full query 517 else: 518 if connexion_format in ["duckdb"]: 519 df = self.conn.execute(query).df() 520 elif connexion_format in ["sqlite"]: 521 df = pd.read_sql_query(query, self.conn) 522 523 return df 524 525 def get_overview(self) -> None: 526 """ 527 The function prints the input, output, config, and dataframe of the current object 528 """ 529 table_variants_from = self.get_table_variants(clause="from") 530 sql_columns = self.get_header_columns_as_sql() 531 sql_query_export = f"SELECT {sql_columns} FROM {table_variants_from}" 532 df = self.get_query_to_df(sql_query_export) 533 log.info( 534 "Input: " 535 + str(self.get_input()) 536 + " [" 537 + str(str(self.get_input_format())) 538 + "]" 539 ) 540 log.info( 541 "Output: " 542 + str(self.get_output()) 543 + " [" 544 + str(str(self.get_output_format())) 545 + "]" 546 ) 547 log.info("Config: ") 548 for d in str(json.dumps(self.get_config(), indent=4, sort_keys=True)).split( 549 "\n" 550 ): 551 log.info("\t" + str(d)) 552 log.info("Param: ") 553 for d 
in str(json.dumps(self.get_param(), indent=4, sort_keys=True)).split( 554 "\n" 555 ): 556 log.info("\t" + str(d)) 557 log.info("Sample list: " + str(self.get_header_sample_list())) 558 log.info("Dataframe: ") 559 for d in str(df).split("\n"): 560 log.info("\t" + str(d)) 561 562 # garbage collector 563 del df 564 gc.collect() 565 566 return None 567 568 def get_stats(self) -> dict: 569 """ 570 The `get_stats` function calculates and returns various statistics of the current object, 571 including information about the input file, variants, samples, header fields, quality, and 572 SNVs/InDels. 573 :return: a dictionary containing various statistics of the current object. The dictionary has 574 the following structure: 575 """ 576 577 # Log 578 log.info(f"Stats Calculation...") 579 580 # table varaints 581 table_variants_from = self.get_table_variants() 582 583 # stats dict 584 stats = {"Infos": {}} 585 586 ### File 587 input_file = self.get_input() 588 stats["Infos"]["Input file"] = input_file 589 590 # Header 591 header_infos = self.get_header().infos 592 header_formats = self.get_header().formats 593 header_infos_list = list(header_infos) 594 header_formats_list = list(header_formats) 595 596 ### Variants 597 598 stats["Variants"] = {} 599 600 # Variants by chr 601 sql_query_nb_variant_by_chrom = f'SELECT "#CHROM" as CHROM, count(*) as count FROM {table_variants_from} GROUP BY "#CHROM"' 602 df_nb_of_variants_by_chrom = self.get_query_to_df(sql_query_nb_variant_by_chrom) 603 nb_of_variants_by_chrom = df_nb_of_variants_by_chrom.sort_values( 604 by=["CHROM"], kind="quicksort" 605 ) 606 607 # Total number of variants 608 nb_of_variants = nb_of_variants_by_chrom["count"].sum() 609 610 # Calculate percentage 611 nb_of_variants_by_chrom["percent"] = nb_of_variants_by_chrom["count"].apply( 612 lambda x: (x / nb_of_variants) 613 ) 614 615 stats["Variants"]["Number of variants by chromosome"] = ( 616 nb_of_variants_by_chrom.to_dict(orient="index") 617 ) 618 619 
stats["Infos"]["Number of variants"] = int(nb_of_variants) 620 621 ### Samples 622 623 # Init 624 samples = {} 625 nb_of_samples = 0 626 627 # Check Samples 628 if "GT" in header_formats_list and "FORMAT" in self.get_header_columns(): 629 log.debug(f"Check samples...") 630 for sample in self.get_header_sample_list(): 631 sql_query_samples = f""" 632 SELECT '{sample}' as sample, 633 REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1) as genotype, 634 count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1)) as count, 635 concat((count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1))/{nb_of_variants})) as percentage 636 FROM {table_variants_from} 637 WHERE ( 638 regexp_matches("{sample}", '^[0-9]([/|][0-9])+') 639 AND 640 len(string_split(CAST("FORMAT" AS VARCHAR), ':')) = len(string_split(CAST("{sample}" AS VARCHAR), ':')) 641 ) 642 GROUP BY genotype 643 """ 644 sql_query_genotype_df = self.conn.execute(sql_query_samples).df() 645 sample_genotype_count = sql_query_genotype_df["count"].sum() 646 if len(sql_query_genotype_df): 647 nb_of_samples += 1 648 samples[f"{sample} - {sample_genotype_count} variants"] = ( 649 sql_query_genotype_df.to_dict(orient="index") 650 ) 651 652 stats["Samples"] = samples 653 stats["Infos"]["Number of samples"] = nb_of_samples 654 655 # # 656 # if "FORMAT" in self.get_header_columns() and "DP" in header_formats_list: 657 # stats["Infos"]["Number of samples"] = nb_of_samples 658 # elif nb_of_samples: 659 # stats["Infos"]["Number of samples"] = "not a VCF format" 660 661 ### INFO and FORMAT fields 662 header_types_df = {} 663 header_types_list = { 664 "List of INFO fields": header_infos, 665 "List of FORMAT fields": header_formats, 666 } 667 i = 0 668 for header_type in header_types_list: 669 670 header_type_infos = header_types_list.get(header_type) 671 header_infos_dict = {} 672 673 for info in header_type_infos: 674 675 i += 1 676 header_infos_dict[i] = {} 677 678 # ID 679 header_infos_dict[i]["id"] = info 680 681 # num 682 genotype_map = 
{None: ".", -1: "A", -2: "G", -3: "R"} 683 if header_type_infos[info].num in genotype_map.keys(): 684 header_infos_dict[i]["Number"] = genotype_map.get( 685 header_type_infos[info].num 686 ) 687 else: 688 header_infos_dict[i]["Number"] = header_type_infos[info].num 689 690 # type 691 if header_type_infos[info].type: 692 header_infos_dict[i]["Type"] = header_type_infos[info].type 693 else: 694 header_infos_dict[i]["Type"] = "." 695 696 # desc 697 if header_type_infos[info].desc != None: 698 header_infos_dict[i]["Description"] = header_type_infos[info].desc 699 else: 700 header_infos_dict[i]["Description"] = "" 701 702 if len(header_infos_dict): 703 header_types_df[header_type] = pd.DataFrame.from_dict( 704 header_infos_dict, orient="index" 705 ).to_dict(orient="index") 706 707 # Stats 708 stats["Infos"]["Number of INFO fields"] = len(header_infos_list) 709 stats["Infos"]["Number of FORMAT fields"] = len(header_formats_list) 710 stats["Header"] = header_types_df 711 712 ### QUAL 713 if "QUAL" in self.get_header_columns(): 714 sql_query_qual = f""" 715 SELECT 716 avg(CAST(QUAL AS INTEGER)) AS Average, 717 min(CAST(QUAL AS INTEGER)) AS Minimum, 718 max(CAST(QUAL AS INTEGER)) AS Maximum, 719 stddev(CAST(QUAL AS INTEGER)) AS StandardDeviation, 720 median(CAST(QUAL AS INTEGER)) AS Median, 721 variance(CAST(QUAL AS INTEGER)) AS Variance 722 FROM {table_variants_from} 723 WHERE CAST(QUAL AS VARCHAR) NOT IN ('.') 724 """ 725 726 qual = self.conn.execute(sql_query_qual).df().to_dict(orient="index") 727 stats["Quality"] = {"Stats": qual} 728 729 ### SNV and InDel 730 731 sql_query_snv = f""" 732 733 SELECT Type, count FROM ( 734 735 SELECT 736 'Total' AS Type, 737 count(*) AS count 738 FROM {table_variants_from} 739 740 UNION 741 742 SELECT 743 'MNV' AS Type, 744 count(*) AS count 745 FROM {table_variants_from} 746 WHERE len(REF) > 1 AND len(ALT) > 1 747 AND len(REF) = len(ALT) 748 749 UNION 750 751 SELECT 752 'InDel' AS Type, 753 count(*) AS count 754 FROM 
def stats_to_file(self, file: str = None) -> str:
    """
    Write the variants statistics to a JSON file.

    :param file: path of the JSON file to write
    :return: the path of the written file
    """

    # Collect statistics and serialize them as pretty-printed JSON
    serialized = json.dumps(self.get_stats(), indent=4)

    # Dump the JSON document to the requested file
    with open(file, "w") as handle:
        handle.write(serialized)

    return file
def print_stats(self, output_file: str = None, json_file: str = None) -> None:
    """
    Generate a markdown report from the stats JSON file and print it.

    :param output_file: path of the markdown output file; when not
        provided, a temporary "stats.md" is used
    :param json_file: path of the JSON stats file; when not provided,
        a temporary "stats.json" is used
    :return: None
    """

    # Full path
    output_file = full_path(output_file)
    json_file = full_path(json_file)

    with tempfile.TemporaryDirectory() as tmpdir:

        # Default files in the temporary folder
        if not output_file:
            output_file = os.path.join(tmpdir, "stats.md")
        if not json_file:
            json_file = os.path.join(tmpdir, "stats.json")

        # Create folders
        if not os.path.exists(os.path.dirname(output_file)):
            Path(os.path.dirname(output_file)).mkdir(parents=True, exist_ok=True)
        if not os.path.exists(os.path.dirname(json_file)):
            Path(os.path.dirname(json_file)).mkdir(parents=True, exist_ok=True)

        # Create stats JSON file
        stats_file = self.stats_to_file(file=json_file)

        # Load stats file
        with open(stats_file) as f:
            stats = yaml.safe_load(f)

        # Output buffers
        output_title = []
        output_index = []
        output = []

        # Title
        output_title.append("# HOWARD Stats")

        # Index
        output_index.append("## Index")

        # Process sections
        for section in stats:
            infos = stats.get(section)
            section_link = "#" + section.lower().replace(" ", "-")
            output.append(f"## {section}")
            output_index.append(f"- [{section}]({section_link})")

            if len(infos):
                for info in infos:
                    # Try to render the info as a markdown table; fall
                    # back to a plain bullet when it is not table-like.
                    # FIX: bare "except:" narrowed to "except Exception:"
                    # so SystemExit/KeyboardInterrupt are not swallowed
                    try:
                        df = pd.DataFrame.from_dict(infos.get(info), orient="index")
                        is_df = True
                    except Exception:
                        try:
                            df = pd.DataFrame.from_dict(
                                json.loads((infos.get(info))), orient="index"
                            )
                            is_df = True
                        except Exception:
                            is_df = False
                    if is_df:
                        output.append(f"### {info}")
                        info_link = "#" + info.lower().replace(" ", "-")
                        output_index.append(f" - [{info}]({info_link})")
                        output.append(f"{df.to_markdown(index=False)}")
                    else:
                        output.append(f"- {info}: {infos.get(info)}")
            else:
                output.append(f"NA")

        # Write stats in markdown file
        with open(output_file, "w") as fp:
            for item in output_title:
                fp.write("%s\n" % item)
            for item in output_index:
                fp.write("%s\n" % item)
            for item in output:
                fp.write("%s\n" % item)

        # Output stats in markdown (index is written to file only)
        print("")
        print("\n\n".join(output_title))
        print("")
        print("\n\n".join(output))
        print("")

    return None
def get_input(self) -> str:
    """Return the input file path stored on the object."""
    return self.input


def get_input_format(self, input_file: str = None) -> str:
    """
    Return the format of the given input file, defaulting to the
    object's own input file.

    :param input_file: path of the file to inspect; defaults to
        ``self.get_input()`` when not provided
    :return: the detected file format
    """

    # Fall back to the object's own input file
    if not input_file:
        input_file = self.get_input()
    # get_file_format is a project helper (howard.functions.commons)
    return get_file_format(input_file)
def get_input_compressed(self, input_file: str = None) -> str:
    """
    Return the compression state of the given input file, defaulting
    to the object's own input file.

    :param input_file: path of the file to inspect; defaults to
        ``self.get_input()`` when not provided
    :return: the compression state of the input file
    """
    if not input_file:
        input_file = self.get_input()
    # get_file_compressed is a project helper (howard.functions.commons)
    return get_file_compressed(input_file)


def get_output(self) -> str:
    """Return the output file path stored on the object."""
    return self.output


def get_output_format(self, output_file: str = None) -> str:
    """
    Return the format of the given output file, defaulting to the
    object's own output file.

    :param output_file: path of the file to inspect; defaults to
        ``self.get_output()`` when not provided
    :return: the detected file format
    """
    if not output_file:
        output_file = self.get_output()
    return get_file_format(output_file)


def get_config(self) -> dict:
    """Return the configuration dictionary."""
    return self.config


def get_param(self) -> dict:
    """Return the parameters dictionary."""
    return self.param


def get_connexion_db(self) -> str:
    """Return the database connexion string."""
    return self.connexion_db
def get_prefix(self) -> str:
    """Return the object's prefix."""
    return self.prefix


def get_table_variants(self, clause: str = "select") -> str:
    """
    Return the variants table expression to use for a given SQL clause.

    :param clause: intended clause, one of "select", "where", "update"
        or "from"; defaults to "select"
    :return: the table name, aliased as "variants" for a "from" clause
    """

    # Access mode from config ("RO" means read-only)
    access = self.get_config().get("access", None)

    # Clauses "select", "where", "update": plain table name
    if clause in ["select", "where", "update"]:
        return self.table_variants
    # Clause "from": alias the source as "variants"
    if clause in ["from"]:
        # Read-only parquet input is queried directly from the file
        if self.get_input_format() in ["parquet"] and access in ["RO"]:
            return f"'{self.get_input()}' as variants"
        return f"{self.table_variants} as variants"
    # Unknown clause: fall back to the plain table name
    return self.table_variants


def get_tmp_dir(self) -> str:
    """
    Return the temporary directory derived from config and param,
    defaulting to "/tmp".
    """
    return get_tmp(
        config=self.get_config(), param=self.get_param(), default_tmp="/tmp"
    )


def get_connexion_type(self) -> str:
    """Return the connexion type from config (default "memory")."""
    return self.get_config().get("connexion_type", "memory")


def get_connexion(self):
    """Return the database connection object."""
    return self.conn
def close_connexion(self) -> None:
    """Close the database connection."""
    return self.conn.close()


def get_header(self, type: str = "vcf"):
    """
    Return the VCF header, either as a vcf.Reader object or as a list
    of raw header lines.

    :param type: "vcf" for a vcf.Reader object, "list" for the raw
        header lines; defaults to "vcf"
    :return: the header in the requested representation
    """

    if self.header_vcf:
        if type == "vcf":
            return self.header_vcf
        if type == "list":
            return self.header_list
    else:
        # No header loaded: fall back to the minimal required VCF header
        if type == "vcf":
            return vcf.Reader(io.StringIO("\n".join(vcf_required)))
        if type == "list":
            return vcf_required


def get_header_length(self, file: str = None) -> int:
    """
    Return the header length, excluding the #CHROM line.

    :param file: optional VCF header file to read instead of the
        object's own header
    :return: number of header lines minus the #CHROM line, or 0 when
        no header is available
    """

    if file:
        return len(self.read_vcf_header_file(file=file)) - 1
    if self.get_header(type="list"):
        return len(self.get_header(type="list")) - 1
    return 0


def get_header_columns(self) -> str:
    """
    Return the #CHROM columns line of the header (its last line), or
    an empty string when no header is available.
    """
    if self.get_header():
        return self.get_header(type="list")[-1]
    return ""
def get_header_columns_as_list(self) -> list:
    """
    Return the #CHROM columns line of the header as a list of column
    names, or an empty list when no header is available.
    """
    if self.get_header():
        return self.get_header_columns().strip().split("\t")
    return []


def get_header_columns_as_sql(self) -> str:
    """
    Return the header columns as a comma-separated list of
    double-quoted SQL identifiers.
    """
    quoted = [f'"{column}"' for column in self.get_header_columns_as_list()]
    return ",".join(quoted)


def get_header_sample_list(
    self, check: bool = False, samples: list = None, samples_force: bool = False
) -> list:
    """
    Return the list of samples from the VCF header, optionally filtered
    to a requested subset and/or checked as genotype columns.

    :param check: when True, keep only samples whose column is a valid
        genotype column; defaults to False
    :param samples: optional subset of samples to keep; samples not
        present in the header are dropped with a warning
    :param samples_force: when True, return the sample list without
        checking genotype columns; defaults to False
    :return: the list of samples
    """

    if samples is None:
        # All samples from the header
        samples_list = self.header_vcf.samples
    else:
        # Keep only the requested samples that exist in the header
        samples_list = []
        for candidate in samples:
            if candidate in self.header_vcf.samples:
                samples_list.append(candidate)
            else:
                log.warning(f"Sample '{candidate}' not defined in header")

    # Force sample list without checking if is_genotype_column
    if samples_force:
        log.warning(f"Samples {samples_list} not checked if genotypes")
        return samples_list

    if check:
        checked = []
        for candidate in samples_list:
            if self.is_genotype_column(column=candidate):
                checked.append(candidate)
            else:
                log.warning(
                    f"Sample '{candidate}' not defined as a sample (genotype not well defined)"
                )
        samples_list = checked

    # Return samples list
    return samples_list
def is_genotype_column(self, column: str = None) -> bool:
    """
    Check whether a column of the input database is a genotype column.

    :param column: name of the column to check
    :return: the result of the Database-level check, or False when no
        column name is given
    """
    if column is None:
        return False
    # Delegate the check to the Database object built on the input file
    return Database(database=self.get_input()).is_genotype_column(column=column)


def get_verbose(self) -> bool:
    """Return the "verbose" flag from config (default False)."""
    return self.get_config().get("verbose", False)


def get_connexion_format(self) -> str:
    """
    Return the connexion format.

    :raises ValueError: when the format is not one of the supported
        formats ("duckdb" or "sqlite")
    """
    connexion_format = self.connexion_format
    if connexion_format not in ["duckdb", "sqlite"]:
        log.error(f"Unknown connexion format {connexion_format}")
        raise ValueError(f"Unknown connexion format {connexion_format}")
    return connexion_format
def insert_file_to_table(
    self,
    file,
    columns: str,
    header_len: int = 0,
    sep: str = "\t",
    chunksize: int = 1000000,
) -> None:
    """
    Read a delimited file by chunks and insert each chunk into the
    "variants" table of the current connection.

    :param file: path or file object of the file to load
    :param columns: comma-separated (quoted) column names for the insert
    :param header_len: number of leading lines to skip; defaults to 0
    :param sep: field delimiter; defaults to a tab
    :param chunksize: number of rows read per chunk; defaults to
        1000000, overridable by the "load.chunk" config entry
    """

    # Chunk size may be overridden by configuration
    chunksize = self.get_config().get("load", {}).get("chunk", chunksize)
    connexion_format = self.get_connexion_format()

    log.debug("chunksize: " + str(chunksize))

    # Nothing to do when no chunk size is defined
    if not chunksize:
        return

    reader = pd.read_csv(
        file, skiprows=header_len, sep=sep, chunksize=chunksize, engine="c"
    )
    for chunk in reader:
        if connexion_format in ["duckdb"]:
            # duckdb resolves the local DataFrame "chunk" by name
            # (replacement scan on local variables)
            self.conn.execute(
                f"INSERT INTO variants ({columns}) SELECT {columns} FROM chunk"
            )
        elif connexion_format in ["sqlite"]:
            chunk.to_sql("variants", self.conn, if_exists="append", index=False)
def load_data(
    self,
    input_file: str = None,
    drop_variants_table: bool = False,
    sample_size: int = 20480,
) -> None:
    """
    Read the input file and insert it into the variants table.

    :param input_file: path of the input file; when provided, it
        replaces the object's current input and the header is reloaded
    :param drop_variants_table: when True, drop the variants table
        before loading; defaults to False
    :param sample_size: number of rows sampled for type detection
        (falsy values mean "all rows", i.e. -1); defaults to 20480
    :raises ValueError: when the input format is not compatible with
        the connexion format
    """

    log.info("Loading...")

    # change input file
    if input_file:
        self.set_input(input_file)
        self.set_header()

    # drop variants table
    if drop_variants_table:
        self.drop_variants_table()

    # get table variants
    table_variants = self.get_table_variants()

    # Access
    access = self.get_config().get("access", None)
    log.debug(f"access: {access}")

    # Input format and compress
    input_format = self.get_input_format()
    input_compressed = self.get_input_compressed()
    log.debug(f"input_format: {input_format}")
    log.debug(f"input_compressed: {input_compressed}")

    # input_compressed_format
    if input_compressed:
        input_compressed_format = "gzip"
    else:
        input_compressed_format = "none"
    log.debug(f"input_compressed_format: {input_compressed_format}")

    # Connexion format
    connexion_format = self.get_connexion_format()

    # Sample size (-1 means no limit)
    if not sample_size:
        sample_size = -1
    log.debug(f"sample_size: {sample_size}")

    # Load data
    log.debug(f"Load Data from {input_format}")

    # DuckDB connexion
    if connexion_format in ["duckdb"]:

        # Database already exists
        # FIX: removed the unreachable else branch that re-tested
        # connexion_format inside this duckdb-only branch
        if self.input_format in ["db", "duckdb"]:
            log.debug(f"Input file format '{self.input_format}' duckDB")

        # Load from another format through the Database object
        else:

            try:
                # Create Table or View
                database = Database(database=self.input)
                sql_from = database.get_sql_from(sample_size=sample_size)

                if access in ["RO"]:
                    sql_load = (
                        f"CREATE VIEW {table_variants} AS SELECT * FROM {sql_from}"
                    )
                else:
                    sql_load = (
                        f"CREATE TABLE {table_variants} AS SELECT * FROM {sql_from}"
                    )
                self.conn.execute(sql_load)

            except Exception:
                # Format not available
                log.error(f"Input file format '{self.input_format}' not available")
                raise ValueError(
                    f"Input file format '{self.input_format}' not available"
                )

    # SQLite connexion
    elif connexion_format in ["sqlite"] and input_format in [
        "vcf",
        "tsv",
        "csv",
        "psv",
    ]:

        # Main structure
        structure = {
            "#CHROM": "VARCHAR",
            "POS": "INTEGER",
            "ID": "VARCHAR",
            "REF": "VARCHAR",
            "ALT": "VARCHAR",
            "QUAL": "VARCHAR",
            "FILTER": "VARCHAR",
            "INFO": "VARCHAR",
        }

        # Structure with samples
        # FIX: copy the dict instead of aliasing it, so the base
        # structure is not mutated
        structure_complete = dict(structure)
        if self.get_header_sample_list():
            structure_complete["FORMAT"] = "VARCHAR"
            for sample in self.get_header_sample_list():
                structure_complete[sample] = "VARCHAR"

        # Columns list for create and insert
        sql_create_table_columns = []
        sql_create_table_columns_list = []
        for column, column_type in structure_complete.items():
            sql_create_table_columns.append(
                f'"{column}" {column_type} default NULL'
            )
            sql_create_table_columns_list.append(f'"{column}"')

        # Create database
        log.debug(f"Create Table {table_variants}")
        sql_create_table_columns_sql = ", ".join(sql_create_table_columns)
        sql_create_table_columns_list_sql = ", ".join(sql_create_table_columns_list)
        sql_create_table = f"CREATE TABLE IF NOT EXISTS {table_variants} ({sql_create_table_columns_sql})"
        self.conn.execute(sql_create_table)

        # chunksize define length of file chunk load file
        chunksize = 100000

        # delimiter
        delimiter = file_format_delimiters.get(input_format, "\t")

        # Open with the appropriate handler
        # FIX: the previous version opened the plain file inside a
        # "with" and then shadowed the handle with an unclosed
        # bgzf.open() handle; now exactly one handle is opened and
        # always closed
        if input_compressed:
            input_file_handle = bgzf.open(self.input, "rt")
        else:
            input_file_handle = open(self.input, "rt")

        try:
            if input_format in ["vcf"]:
                header_len = self.get_header_length()
            else:
                header_len = 0

            # Insert the file contents into a table
            self.insert_file_to_table(
                input_file_handle,
                columns=sql_create_table_columns_list_sql,
                header_len=header_len,
                sep=delimiter,
                chunksize=chunksize,
            )
        finally:
            input_file_handle.close()

    else:
        log.error(
            f"Connexion format '{connexion_format}' not available with format '{input_format}'"
        )
        raise ValueError(
            f"Connexion format '{connexion_format}' not available with format '{input_format}'"
        )

    # Explode INFOS fields into table fields
    if self.get_explode_infos():
        self.explode_infos(
            prefix=self.get_explode_infos_prefix(),
            fields=self.get_explode_infos_fields(),
            force=True,
        )

    # Create index after insertion
    self.create_indexes()
def get_explode_infos(self) -> bool:
    """
    Return the "explode_infos" flag from the parameters, defaulting to
    False when absent.
    """
    explode_params = self.get_param().get("explode", {})
    return explode_params.get("explode_infos", False)
def get_explode_infos_fields(
    self,
    explode_infos_fields: str = None,
    remove_fields_not_in_header: bool = False,
) -> list:
    """
    Return the list of INFO fields to explode.

    :param explode_infos_fields: fields to explode, either a
        comma-separated string or a list; each entry may be a regex
        pattern, and the "*" keyword expands to all header fields;
        defaults to the "explode.explode_infos_fields" parameter,
        then to "*"
    :param remove_fields_not_in_header: when True, drop fields that
        are not present in the header; defaults to False
    :return: the list of fields to explode
    """

    # If no fields, get it in param
    if not explode_infos_fields:
        explode_infos_fields = (
            self.get_param().get("explode", {}).get("explode_infos_fields", None)
        )

    # If no fields, defined as all fields in header using keyword
    # (after this default the value is always truthy; the original
    # dead "else: return []" branch has been removed)
    if not explode_infos_fields:
        explode_infos_fields = "*"

    # Input fields list
    if isinstance(explode_infos_fields, str):
        fields_input = explode_infos_fields.split(",")
    elif isinstance(explode_infos_fields, list):
        fields_input = explode_infos_fields
    else:
        fields_input = []

    # Fields in header
    fields_in_header = sorted(list(set(self.get_header().infos)))

    # Construct list of fields
    fields_output = []
    for field in fields_input:

        # Strip field
        field = field.strip()

        # format keyword * in regex
        if field.upper() in ["*"]:
            field = ".*"

        # Find all fields with pattern
        r = re.compile(field)
        fields_search = sorted(list(filter(r.match, fields_in_header)))

        # An exact match takes precedence over pattern expansion
        if field in fields_search:
            fields_search = [field]
        elif fields_search != [field]:
            # Remove fields explicitly given in input from the
            # pattern expansion (they are handled on their own)
            fields_search = sorted(
                list(set(fields_search).difference(fields_input))
            )

        # If field is not in header (avoid not well formatted header)
        if not fields_search and not remove_fields_not_in_header:
            fields_search = [field]

        # Add found fields
        for new_field in fields_search:
            # Add field, if not already exists, and if it is in header (if asked)
            if (
                new_field not in fields_output
                and (
                    not remove_fields_not_in_header
                    or new_field in fields_in_header
                )
                and new_field not in [".*"]
            ):
                fields_output.append(new_field)

    return fields_output
def get_explode_infos_prefix(self, explode_infos_prefix: str = None) -> str:
    """
    Return the prefix used for exploded INFO fields.

    :param explode_infos_prefix: explicit prefix; defaults to the
        "explode.explode_infos_prefix" parameter, then to ""
    :return: the prefix to use
    """

    if explode_infos_prefix:
        return explode_infos_prefix
    return self.get_param().get("explode", {}).get("explode_infos_prefix", "")
def add_column(
    self,
    table_name,
    column_name,
    column_type,
    default_value=None,
    drop: bool = False,
) -> dict:
    """
    Add a column to a table, optionally dropping a pre-existing column
    with the same name first.

    :param table_name: name of the table to alter
    :param column_name: name of the column to add
    :param column_type: SQL type of the new column (e.g. "INTEGER")
    :param default_value: optional DEFAULT value for the new column
    :param drop: when True, drop the column first when it already
        exists; when False, leave the table unchanged in that case
    :return: a dict describing the added column (table_name,
        column_name, column_type, default_value), or None when the
        column already existed (including the drop-and-re-add case)
    """

    # Case-insensitive check for a pre-existing column
    probe_query = f""" SELECT * FROM {table_name} LIMIT 0 """
    existing = self.get_query_to_df(probe_query).columns.tolist()
    already_there = column_name.upper() in (c.upper() for c in existing)

    dropped = False
    if already_there:
        log.debug(
            f"The {column_name} column already exists in the {table_name} table"
        )
        if not drop:
            return None
        self.drop_column(table_name=table_name, column_name=column_name)
        dropped = True
    else:
        log.debug(f"The {column_name} column NOT exists in the {table_name} table")

    # Add column in table
    alter_query = (
        f""" ALTER TABLE {table_name} ADD COLUMN "{column_name}" {column_type} """
    )
    if default_value is not None:
        alter_query += f" DEFAULT {default_value}"
    self.execute_query(alter_query)
    log.debug(
        f"The {column_name} column was successfully added to the {table_name} table"
    )

    # Dropped-and-re-added columns are not reported as "added"
    if dropped:
        return None
    return {
        "table_name": table_name,
        "column_name": column_name,
        "column_type": column_type,
        "default_value": default_value,
    }
`drop_column` function drops a specified column from a given table in a database and returns 1702 True if the column was successfully dropped, and False if the column does not exist in the 1703 table. 1704 1705 :param column: The `column` parameter is a dictionary that contains information about the column 1706 you want to drop. It has two keys: 1707 :type column: dict 1708 :param table_name: The `table_name` parameter is the name of the table from which you want to 1709 drop a column 1710 :type table_name: str 1711 :param column_name: The `column_name` parameter is the name of the column that you want to drop 1712 from the table 1713 :type column_name: str 1714 :return: a boolean value. It returns True if the column was successfully dropped from the table, 1715 and False if the column does not exist in the table. 1716 """ 1717 1718 # Find column infos 1719 if column: 1720 if isinstance(column, dict): 1721 table_name = column.get("table_name", None) 1722 column_name = column.get("column_name", None) 1723 elif isinstance(column, str): 1724 table_name = self.get_table_variants() 1725 column_name = column 1726 else: 1727 table_name = None 1728 column_name = None 1729 1730 if not table_name and not column_name: 1731 return False 1732 1733 # Removed 1734 removed = False 1735 1736 # Check if the column already exists in the table 1737 query = f""" SELECT * FROM {table_name} LIMIT 0 """ 1738 columns = self.get_query_to_df(query).columns.tolist() 1739 if column_name in columns: 1740 log.debug(f"The {column_name} column exists in the {table_name} table") 1741 else: 1742 log.debug(f"The {column_name} column NOT exists in the {table_name} table") 1743 return False 1744 1745 # Add column in table # ALTER TABLE integers DROP k 1746 add_column_query = f""" ALTER TABLE {table_name} DROP "{column_name}" """ 1747 self.execute_query(add_column_query) 1748 removed = True 1749 log.debug( 1750 f"The {column_name} column was successfully dropped to the {table_name} table" 1751 ) 1752 
        return removed

    def explode_infos(
        self,
        prefix: str = None,
        create_index: bool = False,
        fields: list = None,
        force: bool = False,
        proccess_all_fields_together: bool = False,
        table: str = None,
    ) -> list:
        """
        Explode VCF INFO fields into individual columns of the variants table.

        :param prefix: prefix for the generated columns; defaults to
            get_explode_infos_prefix(), or "INFO/" when that is unset
        :type prefix: str
        :param create_index: create indexes after the explode, defaults to False
        :type create_index: bool (optional)
        :param fields: INFO fields to explode (patterns allowed, translated by
            get_explode_infos_fields())
        :type fields: list
        :param force: drop and re-create a column when it already exists,
            defaults to False
        :type force: bool (optional)
        :param proccess_all_fields_together: run a single UPDATE for all fields
            instead of one UPDATE per field, defaults to False (parameter name
            keeps its historical spelling for API compatibility)
        :type proccess_all_fields_together: bool (optional)
        :param table: target table; defaults to the variants table
        :type table: str
        :return: list of added-column descriptors (see add_column).
        """

        # drop indexes
        self.drop_indexes()

        # connexion format
        connexion_format = self.get_connexion_format()

        # Access
        access = self.get_config().get("access", None)

        # Added columns
        added_columns = []

        # No-op in read-only mode
        if access not in ["RO"]:

            # prefix
            if prefix in [None, True] or not isinstance(prefix, str):
                if self.get_explode_infos_prefix() not in [None, True]:
                    prefix = self.get_explode_infos_prefix()
                else:
                    prefix = "INFO/"

            # table variants
            if table is not None:
                table_variants = table
            else:
                table_variants = self.get_table_variants(clause="select")

            # extra infos
            # NOTE(review): bare except silently falls back to [] on any error
            try:
                extra_infos = self.get_extra_infos()
            except:
                extra_infos = []

            # Header infos
            header_infos = self.get_header().infos

            log.debug(
                f"Explode INFO fields - ADD [{len(header_infos)}] annotations fields"
            )

            sql_info_alter_table_array = []

            # Info fields to check
            fields_list = list(header_infos)
            if fields:
                fields_list += fields
            fields_list = set(fields_list)

            # If no fields
            if not fields:
                fields = []

            # Translate fields if patterns
            fields = self.get_explode_infos_fields(explode_infos_fields=fields)

            for info in fields:

                info_id_sql = prefix + info

                # Only explode fields known from the header, requested with
                # prefix, or present as extra columns
                if (
                    info in fields_list
                    or prefix + info in fields_list
                    or info in extra_infos
                ):

                    log.debug(f"Explode INFO fields - ADD '{info}' annotations fields")

                    # Field type/number from the header; unknown fields are String
                    if info in header_infos:
                        info_type = header_infos[info].type
                        info_num = header_infos[info].num
                    else:
                        info_type = "String"
                        info_num = 0

                    # Non-scalar fields (Number != 1) are stored as VARCHAR
                    type_sql = self.code_type_map_to_sql.get(info_type, "VARCHAR")
                    if info_num != 1:
                        type_sql = "VARCHAR"

                    # Add field
                    added_column = self.add_column(
                        table_name=table_variants,
                        column_name=info_id_sql,
                        column_type=type_sql,
                        default_value="null",
                        drop=force,
                    )

                    if added_column:
                        added_columns.append(added_column)

                    if added_column or force:

                        # add field to index
                        self.index_additionnal_fields.append(info_id_sql)

                        # Update field array
                        # NOTE(review): update_info_field is only assigned for
                        # duckdb/sqlite; any other format would append a stale
                        # value or raise NameError — confirm supported formats.
                        if connexion_format in ["duckdb"]:
                            update_info_field = f"""
                            "{info_id_sql}" =
                                CASE
                                    WHEN REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1) IN ('','.') THEN NULL
                                    ELSE REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1)
                                END
                            """
                        elif connexion_format in ["sqlite"]:
                            # NOTE(review): instr(INFO, '{info}=') also matches
                            # suffixes of longer keys (e.g. 'AF=' within 'MAF=')
                            # — TODO confirm acceptable for this data.
                            update_info_field = f"""
                            "{info_id_sql}" =
                                CASE
                                    WHEN instr(INFO, '{info}=') = 0 THEN NULL
                                    WHEN instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';') = 0 THEN substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1)
                                    ELSE substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1, instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';')-instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')-1)
                                END
                            """

                        sql_info_alter_table_array.append(update_info_field)

            if sql_info_alter_table_array:

                # By chromosomes
                try:
                    chromosomes_list = list(
                        self.get_query_to_df(
                            f""" SELECT "#CHROM" FROM {table_variants} GROUP BY "#CHROM" """
                        )["#CHROM"]
                    )
                except:
                    chromosomes_list = [None]

                for chrom in chromosomes_list:
                    log.debug(f"Explode INFO fields - Chromosome {chrom}...")

                    # Where clause
                    where_clause = ""
                    if chrom and len(chromosomes_list) > 1:
                        where_clause = f""" WHERE "#CHROM" = '{chrom}' """

                    # Update table
                    if proccess_all_fields_together:
                        sql_info_alter_table_array_join = ", ".join(
                            sql_info_alter_table_array
                        )
                        if sql_info_alter_table_array_join:
                            sql_info_alter_table = f"""
                                UPDATE {table_variants}
                                SET {sql_info_alter_table_array_join}
                                {where_clause}
                                """
                            log.debug(
                                f"Explode INFO fields - Explode all {len(sql_info_alter_table_array)} fields..."
                            )
                            # log.debug(sql_info_alter_table)
                            self.conn.execute(sql_info_alter_table)
                    else:
                        sql_info_alter_num = 0
                        for sql_info_alter in sql_info_alter_table_array:
                            sql_info_alter_num += 1
                            sql_info_alter_table = f"""
                                UPDATE {table_variants}
                                SET {sql_info_alter}
                                {where_clause}
                                """
                            log.debug(
                                f"Explode INFO fields - Explode field {sql_info_alter_num}/{len(sql_info_alter_table_array)}..."
                            )
                            # log.debug(sql_info_alter_table)
                            self.conn.execute(sql_info_alter_table)

        # create indexes
        if create_index:
            self.create_indexes()

        return added_columns

    def create_indexes(self) -> None:
        """
        Create indexes on the variants table after insertion: one composite
        ("#CHROM", "POS", "REF", "ALT") index, one index per coordinate column,
        and one per additional exploded INFO field.
        Skipped when indexing is disabled or access is read-only ("RO").
        """

        # Access
        access = self.get_config().get("access", None)

        # get table variants
        table_variants = self.get_table_variants("FROM")

        if self.get_indexing() and access not in ["RO"]:
            # Create index
            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()} ON {table_variants} ("#CHROM", "POS", "REF", "ALT")'
            self.conn.execute(sql_create_table_index)
            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_chrom ON {table_variants} ("#CHROM")'
            self.conn.execute(sql_create_table_index)
            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_pos ON {table_variants} ("POS")'
            self.conn.execute(sql_create_table_index)
            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_ref ON {table_variants} ( "REF")'
            self.conn.execute(sql_create_table_index)
            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_alt ON {table_variants} ("ALT")'
            self.conn.execute(sql_create_table_index)
            for field in self.index_additionnal_fields:
                sql_create_table_index = f""" CREATE INDEX IF NOT EXISTS "idx_{self.get_table_variants()}_{field}" ON {table_variants} ("{field}") """
                self.conn.execute(sql_create_table_index)

    def drop_indexes(self) -> None:
        """
        Drop all existing indexes on the variants table (unless access is
        read-only).  (Docstring corrected: this method drops indexes, it does
        not create them.)
        """

        # Access
        access = self.get_config().get("access", None)

        # get table variants
        table_variants = self.get_table_variants("FROM")

        # Get database format
        connexion_format = 
self.get_connexion_format()

        if access not in ["RO"]:
            # NOTE(review): sql_list_indexes is only assigned for duckdb/sqlite;
            # any other connexion format raises NameError below — confirm the
            # set of supported formats.
            if connexion_format in ["duckdb"]:
                sql_list_indexes = f"SELECT index_name FROM duckdb_indexes WHERE table_name='{table_variants}'"
            elif connexion_format in ["sqlite"]:
                sql_list_indexes = f"SELECT name FROM sqlite_master WHERE type='index' AND tbl_name='{table_variants}';"

            list_indexes = self.conn.execute(sql_list_indexes)
            index_names = [row[0] for row in list_indexes.fetchall()]
            for index in index_names:
                sql_drop_table_index = f""" DROP INDEX IF EXISTS "{index}" """
                self.conn.execute(sql_drop_table_index)

    def read_vcf_header(self, f) -> list:
        """
        Read the header of a VCF from an open file object, up to and including
        the "#CHROM" line.

        :param f: the file object (or any iterable of lines)
        :return: the header lines of the VCF file (including the "#CHROM" line).
        """

        header_list = []
        for line in f:
            header_list.append(line)
            # The "#CHROM" line is the last header line of a VCF
            if line.startswith("#CHROM"):
                break
        return header_list

    def read_vcf_header_file(self, file: str = None) -> list:
        """
        Read the header of a VCF file, handling both bgzip-compressed and
        uncompressed files.

        :param file: path to the VCF (header) file to read
        :type file: str
        :return: the header lines as a list (see read_vcf_header).
        """

        if self.get_input_compressed(input_file=file):
            with bgzf.open(file, "rt") as f:
                return self.read_vcf_header(f=f)
        else:
            with open(file, "rt") as f:
                return self.read_vcf_header(f=f)

    def execute_query(self, query: str):
        """
        It takes a query as an argument, executes it, and returns the results

        :param query: The query to be executed
        :return: The result of the query is being returned.
        """
        if query:
            return self.conn.execute(query)  # .fetchall()
        else:
            return None

    def export_output(
        self,
        output_file: str | None = None,
        output_header: str | None = None,
        export_header: bool = True,
        query: str | None = None,
        parquet_partitions: list | None = None,
        chunk_size: int | None = None,
        threads: int | None = None,
        sort: bool = False,
        index: bool = False,
        order_by: str | None = None,
    ) -> bool:
        """
        Export the variants data to `output_file` (VCF, CSV, TSV, PSV, Parquet...).

        :param output_file: path of the file to generate; defaults to self.get_output()
        :param output_header: path of the exported header file; defaults to
            "<output_file>.hdr"
        :param export_header: export the VCF header to a separate file (forced
            off — and the header file scheduled for removal — for VCF outputs),
            defaults to True
        :param query: optional SQL query selecting the data to export
        :param parquet_partitions: columns used to partition a Parquet export
            (a comma-separated string from param is split into a list)
        :param chunk_size: batch size (records) for Parquet export; defaults to
            config["chunk_size"]
        :param threads: number of export threads; defaults to get_threads()
        :param sort: sort the output by genomic coordinates, defaults to False
        :param index: create an index on the output file, defaults to False
        :param order_by: column(s) used to order the output; defaults to
            param["export"]["order_by"]
        :return: True when the output file exists after export, otherwise None.
        """

        # Log
        log.info("Exporting...")

        # Full path
        output_file = full_path(output_file)
        output_header = full_path(output_header)

        # Config
        config = self.get_config()

        # Param
        param = self.get_param()

        # Tmp files to remove
        tmp_to_remove = []

        # If no output, get it
        if not output_file:
            output_file = self.get_output()

        # If not threads
        if not threads:
            threads = self.get_threads()

        # Auto header name with extension
        if export_header or output_header:
            if not output_header:
                output_header = f"{output_file}.hdr"
            # Export header
            self.export_header(output_file=output_file)

        # Switch off export header if VCF output
        output_file_type = get_file_format(output_file)
        if output_file_type in ["vcf"]:
            export_header = False
            tmp_to_remove.append(output_header)

        # Chunk size
        if not chunk_size:
            chunk_size = config.get("chunk_size", None)

        # Parquet partition
        if not parquet_partitions:
            parquet_partitions = param.get("export", {}).get("parquet_partitions", None)
        if parquet_partitions and isinstance(parquet_partitions, str):
            parquet_partitions = parquet_partitions.split(",")

        # Order by
        if not order_by:
            order_by = param.get("export", {}).get("order_by", "")

        # Header in output
        header_in_output = param.get("export", {}).get("include_header", False)

        # Database
        database_source = self.get_connexion()

        # Connexion format
        connexion_format = self.get_connexion_format()

        # Explode infos
        if self.get_explode_infos():
            self.explode_infos(
                prefix=self.get_explode_infos_prefix(),
                fields=self.get_explode_infos_fields(),
                force=False,
            )

        # For sqlite, dump the variants table to a temporary Parquet file and
        # export from that file instead of the live connexion
        # if connexion_format in ["sqlite"] or query:
        if connexion_format in ["sqlite"]:

            # Export in Parquet
            random_tmp = "".join(
                random.choice(string.ascii_lowercase) for i in range(10)
            )
            database_source = f"""{output_file}.{random_tmp}.database_export.parquet"""
            tmp_to_remove.append(database_source)

            # Table Variants
            table_variants = self.get_table_variants()

            # Create export query
            sql_query_export_subquery = f"""
                SELECT * FROM {table_variants}
                """

            # Write source file
            fp.write(database_source, self.get_query_to_df(sql_query_export_subquery))

        # Create database
        database = Database(
            database=database_source,
            table="variants",
            header_file=output_header,
            conn_config=self.get_connexion_config(),
        )

        # Existing colomns header
        existing_columns_header = database.get_header_columns_from_database()

        # Sample list
        get_samples = self.get_samples()
        get_samples_check = self.get_samples_check()
        # samples_force: an explicit samples list overrides the header samples
        samples_force = get_samples is not None
        sample_list = self.get_header_sample_list(
            check=get_samples_check, samples=get_samples, samples_force=samples_force
        )

        # Export file
        database.export(
            output_database=output_file,
            output_header=output_header,
            existing_columns_header=existing_columns_header,
            parquet_partitions=parquet_partitions,
            chunk_size=chunk_size,
            threads=threads,
            sort=sort,
            index=index,
            header_in_output=header_in_output,
            order_by=order_by,
            query=query,
            export_header=export_header,
            sample_list=sample_list,
        )

        # Remove
        remove_if_exists(tmp_to_remove)

        # NOTE(review): both operands of `and` are identical; this is equivalent
        # to `os.path.exists(output_file) or None`.
        return (os.path.exists(output_file) or None) and (
            os.path.exists(output_file) or None
        )

    def get_extra_infos(self, table: str = None) -> list:
        """
        The `get_extra_infos` function returns a list of columns that are in a specified table but not
        in the header.

        :param table: name of the table to inspect; when omitted, the variants
            table is used and its header columns are taken as the reference
        :type table: str
        :return: a list of columns present in the table but absent from the header.
        """

        header_columns = []

        if not table:
            table = self.get_table_variants(clause="from")
            header_columns = self.get_header_columns()

        # Check all columns in the database
        query = f""" SELECT * FROM {table} LIMIT 1 """
        log.debug(f"query {query}")
        table_columns = self.get_query_to_df(query).columns.tolist()
        extra_columns = []

        # Construct extra infos (not in header)
        for column in table_columns:
            if column not in header_columns:
                extra_columns.append(column)

        return extra_columns

    def get_extra_infos_sql(self, table: str = None) -> str:
        """
        Return the extra (non-header) columns of a table as a comma-separated,
        double-quoted SQL column list.

        :param table: the table to get the extra infos from; when None, the
            default table is used
        :type table: str
        :return: a SQL-ready string of quoted extra column names.
        """

        return ", ".join(
            ['"' + str(elem) + '"' for elem in self.get_extra_infos(table=table)]
        )

    def export_header(
        self,
        header_name: str = None,
        output_file: str = None,
        output_file_ext: str = ".hdr",
        clean_header: bool = True,
        remove_chrom_line: bool = False,
    ) -> str:
        """
        Extract the VCF header, adjust it, and write it to "<output_file><ext>".

        :param header_name: NOTE(review): accepted but never used by the body —
            confirm whether it should name the output header file
        :type header_name: str
        :param output_file: base name for the written header file; defaults to
            self.get_output() when neither header_name nor output_file is given
        :type output_file: str
        :param output_file_ext: extension appended to output_file for the header
            file, defaults to .hdr
        :type output_file_ext: str (optional)
        :param clean_header: rewrite malformed "##FORMAT=...Type=Flag" lines to
            Type=String, defaults to True
        :type clean_header: bool (optional)
        :param remove_chrom_line: drop the final #CHROM line from the written
            header, defaults to False
        :type remove_chrom_line: bool (optional)
        :return: the name of the header file that was written (None when the
            object has no header).
        """

        if not header_name and not output_file:
            output_file = self.get_output()

        if self.get_header():

            # Get header object
            header_obj = self.get_header()

            # Create database
            db_for_header = Database(database=self.get_input())

            # Get real columns in the file
            db_header_columns = db_for_header.get_columns()

            with tempfile.TemporaryDirectory() as tmpdir:

                # Write header file
                header_file_tmp = os.path.join(tmpdir, "header")
                f = open(header_file_tmp, "w")
                vcf.Writer(f, header_obj)
                f.close()

                # Replace #CHROM line with rel columns
                header_list = db_for_header.read_header_file(
                    header_file=header_file_tmp
                )
                header_list[-1] = "\t".join(db_header_columns)

                # Remove CHROM line
                if remove_chrom_line:
                    header_list.pop()

                # Clean header
                if clean_header:
                    header_list_clean = []
                    for head in header_list:
                        # Clean head for malformed header
                        head_clean = head
                        head_clean = re.subn(
                            "##FORMAT=<ID=(.*),Number=(.*),Type=Flag",
                            r"##FORMAT=<ID=\1,Number=\2,Type=String",
                            head_clean,
                            2,
                        )[0]
                        # Write header
                        header_list_clean.append(head_clean)
                    header_list = header_list_clean

                # NOTE(review): when header_name is given but output_file is
                # None, this concatenation raises TypeError — confirm callers.
                tmp_header_name = output_file + output_file_ext

                f = open(tmp_header_name, "w")
                for line in header_list:
                    f.write(line)
                f.close()

                return tmp_header_name

    def export_variant_vcf(
        self,
        vcf_file,
        remove_info: bool = False,
        add_samples: bool = True,
        list_samples: list = [],
        where_clause: str = "",
        index: bool = False,
        threads: int | None = None,
    ) -> bool | None:
        """
        The `export_variant_vcf` function exports a VCF file with specified samples, allowing options to
        remove INFO field, add samples, and control compression and indexing.

        :param vcf_file: path of the VCF file to write
        :param remove_info: when truthy, replace the INFO column with "." (or
            with the given string when remove_info is itself a str), defaults
            to False
        :type remove_info: bool (optional)
        :param add_samples: include FORMAT and sample columns in the output,
            defaults to True
        :type add_samples: bool (optional)
        :param list_samples: samples to include; empty means all header samples
        :type list_samples: list
        :param where_clause: optional SQL WHERE clause filtering exported variants
        :type where_clause: str
        :param index: index the output VCF (tabix), defaults to False
        :type index: bool (optional)
        :param threads: number of export threads; defaults to get_threads()
        :type threads: int | None
        :return: the result of export_output() for the built SELECT query.
        """

        # Config
        config = self.get_config()

        # Extract VCF
        log.debug("Export VCF...")

        # Table variants
        table_variants = self.get_table_variants()

        # Threads
        if not threads:
            threads = self.get_threads()

        # Info fields
        if remove_info:
            # A non-str truthy value means "replace INFO with '.'"
            if not isinstance(remove_info, str):
                remove_info = "."
            info_field = f"""'{remove_info}' as INFO"""
        else:
            info_field = "INFO"

        # Samples fields
        if add_samples:
            if not list_samples:
                list_samples = self.get_header_sample_list()
            if list_samples:
                samples_fields = " , FORMAT , " + " , ".join(list_samples)
            else:
                samples_fields = ""
            log.debug(f"samples_fields: {samples_fields}")
        else:
            samples_fields = ""

        # Where clause
        if where_clause is None:
            where_clause = ""

        # Variants
        select_fields = """ "#CHROM", POS, ID, REF, ALT, QUAL, FILTER """
        sql_query_select = f""" SELECT {select_fields}, {info_field} {samples_fields} FROM {table_variants} {where_clause} """
        log.debug(f"sql_query_select={sql_query_select}")

        return self.export_output(
            output_file=vcf_file,
            output_header=None,
            export_header=True,
            query=sql_query_select,
            parquet_partitions=None,
            chunk_size=config.get("chunk_size", None),
            threads=threads,
            sort=True,
            index=index,
            order_by=None,
        )

    def run_commands(self, commands: list = [], threads: int = 1) -> None:
        """
        Run a list of commands in parallel using the given number of threads.

        :param commands: a list of commands to run
        :param threads: the number of threads to use, defaults to 1 (optional)
        """

        # NOTE(review): mutable default argument `commands=[]`; harmless here
        # since the list is only read, but `commands=None` would be safer.
        run_parallel_commands(commands, threads)

    def get_threads(self, default: int = 1) -> int:
        """
        Return the number of threads to use for the current job.

        Resolution order: param["threads"], then config["threads"], then
        `default`.  A configured value <= 0 means "use all available CPUs".

        :param default: number of threads when none is configured, defaults to 1
        :type default: int (optional)
        :return: the number of threads to use for the current job.
        """

        # Config
        config = self.get_config()

        # Param
        param = self.get_param()

        # Input threads
        input_thread = param.get("threads", config.get("threads", None))

        # Check threads
        if not input_thread:
            threads = default
        elif int(input_thread) <= 0:
            threads = os.cpu_count()
        else:
            threads = int(input_thread)
        return threads

    def get_memory(self, default: str = None) -> str:
        """
        Retrieve the memory value from parameters or configuration, with a
        fallback default.

        Resolution order: param["memory"], then config["memory"], then `default`.

        :type default: str
        :return: a string representing the memory parameter.  If the
            `input_memory` is provided in the parameters, it will return that value. Otherwise, it will
Otherwise, it will 2562 return the default value provided as an argument to the function. 2563 """ 2564 2565 # Config 2566 config = self.get_config() 2567 2568 # Param 2569 param = self.get_param() 2570 2571 # Input threads 2572 input_memory = param.get("memory", config.get("memory", None)) 2573 2574 # Check threads 2575 if input_memory: 2576 memory = input_memory 2577 else: 2578 memory = default 2579 2580 return memory 2581 2582 def update_from_vcf(self, vcf_file: str) -> None: 2583 """ 2584 > If the database is duckdb, then use the parquet method, otherwise use the sqlite method 2585 2586 :param vcf_file: the path to the VCF file 2587 """ 2588 2589 connexion_format = self.get_connexion_format() 2590 2591 if connexion_format in ["duckdb"]: 2592 self.update_from_vcf_duckdb(vcf_file) 2593 elif connexion_format in ["sqlite"]: 2594 self.update_from_vcf_sqlite(vcf_file) 2595 2596 def update_from_vcf_duckdb(self, vcf_file: str) -> None: 2597 """ 2598 It takes a VCF file and updates the INFO column of the variants table in the database with the 2599 INFO column of the VCF file 2600 2601 :param vcf_file: the path to the VCF file 2602 """ 2603 2604 # varaints table 2605 table_variants = self.get_table_variants() 2606 2607 # Loading VCF into temporaire table 2608 skip = self.get_header_length(file=vcf_file) 2609 vcf_df = pd.read_csv( 2610 vcf_file, 2611 sep="\t", 2612 engine="c", 2613 skiprows=skip, 2614 header=0, 2615 low_memory=False, 2616 ) 2617 sql_query_update = f""" 2618 UPDATE {table_variants} as table_variants 2619 SET INFO = concat( 2620 CASE 2621 WHEN INFO NOT IN ('', '.') 2622 THEN INFO 2623 ELSE '' 2624 END, 2625 ( 2626 SELECT 2627 concat( 2628 CASE 2629 WHEN table_variants.INFO NOT IN ('','.') AND table_parquet.INFO NOT IN ('','.') 2630 THEN ';' 2631 ELSE '' 2632 END 2633 , 2634 CASE 2635 WHEN table_parquet.INFO NOT IN ('','.') 2636 THEN table_parquet.INFO 2637 ELSE '' 2638 END 2639 ) 2640 FROM vcf_df as table_parquet 2641 WHERE CAST(table_parquet.\"#CHROM\" AS 
VARCHAR) = CAST(table_variants.\"#CHROM\" AS VARCHAR) 2642 AND table_parquet.\"POS\" = table_variants.\"POS\" 2643 AND table_parquet.\"ALT\" = table_variants.\"ALT\" 2644 AND table_parquet.\"REF\" = table_variants.\"REF\" 2645 AND table_parquet.INFO NOT IN ('','.') 2646 ) 2647 ) 2648 ; 2649 """ 2650 self.conn.execute(sql_query_update) 2651 2652 def update_from_vcf_sqlite(self, vcf_file: str) -> None: 2653 """ 2654 It creates a temporary table in the SQLite database, loads the VCF file into the temporary 2655 table, then updates the INFO column of the variants table with the INFO column of the temporary 2656 table 2657 2658 :param vcf_file: The path to the VCF file you want to update the database with 2659 """ 2660 2661 # Create a temporary table for the VCF 2662 table_vcf = "tmp_vcf" 2663 sql_create = ( 2664 f"CREATE TEMPORARY TABLE {table_vcf} AS SELECT * FROM variants WHERE 0" 2665 ) 2666 self.conn.execute(sql_create) 2667 2668 # Loading VCF into temporaire table 2669 vcf_df = pd.read_csv( 2670 vcf_file, sep="\t", comment="#", header=None, low_memory=False 2671 ) 2672 vcf_df.columns = ["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"] 2673 vcf_df.to_sql(table_vcf, self.conn, if_exists="append", index=False) 2674 2675 # Update table 'variants' with VCF data 2676 # warning: CONCAT as || operator 2677 sql_query_update = f""" 2678 UPDATE variants as table_variants 2679 SET INFO = CASE 2680 WHEN INFO NOT IN ('', '.') 2681 THEN INFO 2682 ELSE '' 2683 END || 2684 ( 2685 SELECT 2686 CASE 2687 WHEN table_variants.INFO NOT IN ('','.') 2688 AND table_vcf.INFO NOT IN ('','.') 2689 THEN ';' 2690 ELSE '' 2691 END || 2692 CASE 2693 WHEN table_vcf.INFO NOT IN ('','.') 2694 THEN table_vcf.INFO 2695 ELSE '' 2696 END 2697 FROM {table_vcf} as table_vcf 2698 WHERE table_vcf.\"#CHROM\" = table_variants.\"#CHROM\" 2699 AND table_vcf.\"POS\" = table_variants.\"POS\" 2700 AND table_vcf.\"ALT\" = table_variants.\"ALT\" 2701 AND table_vcf.\"REF\" = table_variants.\"REF\" 2702 
) 2703 """ 2704 self.conn.execute(sql_query_update) 2705 2706 # Drop temporary table 2707 sql_drop = f"DROP TABLE {table_vcf}" 2708 self.conn.execute(sql_drop) 2709 2710 def drop_variants_table(self) -> None: 2711 """ 2712 > This function drops the variants table 2713 """ 2714 2715 table_variants = self.get_table_variants() 2716 sql_table_variants = f"DROP TABLE IF EXISTS {table_variants}" 2717 self.conn.execute(sql_table_variants) 2718 2719 def set_variant_id( 2720 self, variant_id_column: str = "variant_id", force: bool = None 2721 ) -> str: 2722 """ 2723 It adds a column to the variants table called `variant_id` and populates it with a hash of the 2724 `#CHROM`, `POS`, `REF`, and `ALT` columns 2725 2726 :param variant_id_column: The name of the column to be created in the variants table, defaults 2727 to variant_id 2728 :type variant_id_column: str (optional) 2729 :param force: If True, the variant_id column will be created even if it already exists 2730 :type force: bool 2731 :return: The name of the column that contains the variant_id 2732 """ 2733 2734 # Assembly 2735 assembly = self.get_param().get( 2736 "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY) 2737 ) 2738 2739 # INFO/Tag prefix 2740 prefix = self.get_explode_infos_prefix() 2741 2742 # Explode INFO/SVTYPE 2743 added_columns = self.explode_infos(prefix=prefix, fields=["SVTYPE"]) 2744 2745 # variants table 2746 table_variants = self.get_table_variants() 2747 2748 # variant_id column 2749 if not variant_id_column: 2750 variant_id_column = "variant_id" 2751 2752 # Creta variant_id column 2753 if "variant_id" not in self.get_extra_infos() or force: 2754 2755 # Create column 2756 self.add_column( 2757 table_name=table_variants, 2758 column_name=variant_id_column, 2759 column_type="UBIGINT", 2760 default_value="0", 2761 ) 2762 2763 # Update column 2764 self.conn.execute( 2765 f""" 2766 UPDATE {table_variants} 2767 SET "{variant_id_column}" = hash('{assembly}', "#CHROM", "POS", "REF", "ALT", 
'"{prefix}SVTYPE"') 2768 """ 2769 ) 2770 2771 # Remove added columns 2772 for added_column in added_columns: 2773 self.drop_column(column=added_column) 2774 2775 # return variant_id column name 2776 return variant_id_column 2777 2778 def get_variant_id_column( 2779 self, variant_id_column: str = "variant_id", force: bool = None 2780 ) -> str: 2781 """ 2782 This function returns the variant_id column name 2783 2784 :param variant_id_column: The name of the column in the dataframe that contains the variant IDs, 2785 defaults to variant_id 2786 :type variant_id_column: str (optional) 2787 :param force: If True, will force the variant_id to be set to the value of variant_id_column. If 2788 False, will only set the variant_id if it is not already set. If None, will set the variant_id 2789 if it is not already set, or if it is set 2790 :type force: bool 2791 :return: The variant_id column name. 2792 """ 2793 2794 return self.set_variant_id(variant_id_column=variant_id_column, force=force) 2795 2796 ### 2797 # Annotation 2798 ### 2799 2800 def scan_databases( 2801 self, 2802 database_formats: list = ["parquet"], 2803 database_releases: list = ["current"], 2804 ) -> dict: 2805 """ 2806 The function `scan_databases` scans for available databases based on specified formats and 2807 releases. 2808 2809 :param database_formats: The `database_formats` parameter is a list that specifies the formats 2810 of the databases to be scanned. In this case, the accepted format is "parquet" 2811 :type database_formats: list ["parquet"] 2812 :param database_releases: The `database_releases` parameter is a list that specifies the 2813 releases of the databases to be scanned. 
In the provided function, the default value for 2814 `database_releases` is set to `["current"]`, meaning that by default, the function will scan 2815 databases that are in the "current" 2816 :type database_releases: list 2817 :return: The function `scan_databases` returns a dictionary containing information about 2818 databases that match the specified formats and releases. 2819 """ 2820 2821 # Config 2822 config = self.get_config() 2823 2824 # Param 2825 param = self.get_param() 2826 2827 # Param - Assembly 2828 assembly = param.get("assembly", config.get("assembly", None)) 2829 if not assembly: 2830 assembly = DEFAULT_ASSEMBLY 2831 log.warning(f"Default assembly '{assembly}'") 2832 2833 # Scan for availabled databases 2834 log.info( 2835 f"Annotations - Check annotation parameters - Scan existing databases - Assembly {[assembly]} - Formats {database_formats} - Releases {database_releases}..." 2836 ) 2837 databases_infos_dict = databases_infos( 2838 database_folder_releases=database_releases, 2839 database_formats=database_formats, 2840 assembly=assembly, 2841 config=config, 2842 ) 2843 log.info( 2844 f"Annotations - Check annotation parameters - Scan existing databases - {len(databases_infos_dict)} databases found" 2845 ) 2846 2847 return databases_infos_dict 2848 2849 def annotation(self) -> None: 2850 """ 2851 It annotates the VCF file with the annotations specified in the config file. 
        """

        # Config
        config = self.get_config()

        # Param
        param = self.get_param()

        # Param - Assembly
        assembly = param.get("assembly", config.get("assembly", None))
        if not assembly:
            assembly = DEFAULT_ASSEMBLY
            log.warning(f"Default assembly '{assembly}'")

        # annotations databases folders
        # Union of the generic annotations folders and the parquet/bcftools
        # specific folders, used later to locate database files by name
        annotations_databases = set(
            config.get("folders", {})
            .get("databases", {})
            .get("annotations", [DEFAULT_ANNOTATIONS_FOLDER])
            + config.get("folders", {})
            .get("databases", {})
            .get("parquet", ["~/howard/databases/parquet/current"])
            + config.get("folders", {})
            .get("databases", {})
            .get("bcftools", ["~/howard/databases/bcftools/current"])
        )

        # Get param annotations
        # Start from the comma-separated "annotations" string, if provided
        if param.get("annotations", None) and isinstance(
            param.get("annotations", None), str
        ):
            log.debug(param.get("annotations", None))
            param_annotation_list = param.get("annotations").split(",")
        else:
            param_annotation_list = []

        # Each tools param
        # Normalize the per-tool CLI parameters into "tool:..." entries of the
        # unified annotations list ("," inside a tool spec becomes "+")
        if param.get("annotation_parquet", None) != None:
            log.debug(
                f"""param.get("annotation_parquet", None)={param.get("annotation_parquet", None)}"""
            )
            if isinstance(param.get("annotation_parquet", None), list):
                param_annotation_list.append(",".join(param.get("annotation_parquet")))
            else:
                param_annotation_list.append(param.get("annotation_parquet"))
        if param.get("annotation_snpsift", None) != None:
            if isinstance(param.get("annotation_snpsift", None), list):
                param_annotation_list.append(
                    "snpsift:"
                    + "+".join(param.get("annotation_snpsift")).replace(",", "+")
                )
            else:
                param_annotation_list.append(
                    "snpsift:" + param.get("annotation_snpsift").replace(",", "+")
                )
        if param.get("annotation_snpeff", None) != None:
            param_annotation_list.append("snpeff:" + param.get("annotation_snpeff"))
        if param.get("annotation_bcftools", None) != None:
            if isinstance(param.get("annotation_bcftools", None), list):
                param_annotation_list.append(
                    "bcftools:"
                    + "+".join(param.get("annotation_bcftools")).replace(",", "+")
                )
            else:
                param_annotation_list.append(
                    "bcftools:" + param.get("annotation_bcftools").replace(",", "+")
                )
        if param.get("annotation_annovar", None) != None:
            param_annotation_list.append("annovar:" + param.get("annotation_annovar"))
        if param.get("annotation_exomiser", None) != None:
            param_annotation_list.append("exomiser:" + param.get("annotation_exomiser"))
        if param.get("annotation_splice", None) != None:
            param_annotation_list.append("splice:" + param.get("annotation_splice"))

        # Merge param annotations list
        param["annotations"] = ",".join(param_annotation_list)

        # debug
        log.debug(f"param_annotations={param['annotations']}")

        if param.get("annotations"):

            # Log
            # log.info("Annotations - Check annotation parameters")

            if not "annotation" in param:
                param["annotation"] = {}

            # List of annotations parameters
            # Map each annotation spec to its fields dict ({"INFO": None}
            # means "all INFO fields")
            annotations_list_input = {}
            if isinstance(param.get("annotations", None), str):
                annotation_file_list = [
                    value for value in param.get("annotations", "").split(",")
                ]
                for annotation_file in annotation_file_list:
                    annotations_list_input[annotation_file] = {"INFO": None}
            else:
                annotations_list_input = param.get("annotations", {})

            log.info(f"Quick Annotations:")
            for annotation_key in list(annotations_list_input.keys()):
                log.info(f"   {annotation_key}")

            # List of annotations and associated fields
            annotations_list = {}

            for annotation_file in annotations_list_input:

                # Explode annotations if ALL
                # "ALL" (optionally "ALL:format=...:release=...") expands to
                # every database found by scan_databases
                if (
                    annotation_file.upper() == "ALL"
                    or annotation_file.upper().startswith("ALL:")
                ):

                    # check ALL parameters (formats, releases)
                    annotation_file_split = annotation_file.split(":")
                    database_formats = "parquet"
                    database_releases = "current"
                    for annotation_file_option in annotation_file_split[1:]:
                        database_all_options_split = annotation_file_option.split("=")
                        if database_all_options_split[0] == "format":
                            database_formats = database_all_options_split[1].split("+")
                        if database_all_options_split[0] == "release":
                            database_releases = database_all_options_split[1].split("+")

                    # Scan for availabled databases
                    databases_infos_dict = self.scan_databases(
                        database_formats=database_formats,
                        database_releases=database_releases,
                    )

                    # Add found databases in annotation parameters
                    for database_infos in databases_infos_dict.keys():
                        annotations_list[database_infos] = {"INFO": None}

                else:
                    annotations_list[annotation_file] = annotations_list_input[
                        annotation_file
                    ]

            # Check each databases
            if len(annotations_list):

                log.info(
                    f"Annotations - Check annotation parameters - Check {len(annotations_list)} databases..."
                )

                for annotation_file in annotations_list:

                    # Init
                    annotations = annotations_list.get(annotation_file, None)

                    # Annotation snpEff
                    if annotation_file.startswith("snpeff"):

                        log.debug(f"Quick Annotation snpEff")

                        if "snpeff" not in param["annotation"]:
                            param["annotation"]["snpeff"] = {}

                        if "options" not in param["annotation"]["snpeff"]:
                            param["annotation"]["snpeff"]["options"] = ""

                        # snpEff options in annotations
                        param["annotation"]["snpeff"]["options"] = "".join(
                            annotation_file.split(":")[1:]
                        )

                    # Annotation Annovar
                    elif annotation_file.startswith("annovar"):

                        log.debug(f"Quick Annotation Annovar")

                        if "annovar" not in param["annotation"]:
                            param["annotation"]["annovar"] = {}

                        if "annotations" not in param["annotation"]["annovar"]:
                            param["annotation"]["annovar"]["annotations"] = {}

                        # Options
                        annotation_file_split = annotation_file.split(":")
                        for annotation_file_annotation in annotation_file_split[1:]:
                            if annotation_file_annotation:
                                param["annotation"]["annovar"]["annotations"][
                                    annotation_file_annotation
                                ] = annotations

                    # Annotation Exomiser
                    elif annotation_file.startswith("exomiser"):

                        log.debug(f"Quick Annotation Exomiser")

                        param["annotation"]["exomiser"] = params_string_to_dict(
                            annotation_file
                        )

                    # Annotation Splice
                    elif annotation_file.startswith("splice"):

                        log.debug(f"Quick Annotation Splice")

                        param["annotation"]["splice"] = params_string_to_dict(
                            annotation_file
                        )

                    # Annotation Parquet or BCFTOOLS
                    else:

                        # Tools detection
                        # An explicit "bcftools:"/"snpsift:" prefix forces the
                        # tool; otherwise it is inferred from the file format
                        if annotation_file.startswith("bcftools:"):
                            annotation_tool_initial = "bcftools"
                            annotation_file = ":".join(annotation_file.split(":")[1:])
                        elif annotation_file.startswith("snpsift:"):
                            annotation_tool_initial = "snpsift"
                            annotation_file = ":".join(annotation_file.split(":")[1:])
                        else:
                            annotation_tool_initial = None

                        # list of files
                        annotation_file_list = annotation_file.replace("+", ":").split(
                            ":"
                        )

                        for annotation_file in annotation_file_list:

                            if annotation_file:

                                # Annotation tool initial
                                annotation_tool = annotation_tool_initial

                                # Find file
                                annotation_file_found = None

                                # Expand user
                                annotation_file = full_path(annotation_file)

                                if os.path.exists(annotation_file):
                                    annotation_file_found = annotation_file

                                else:
                                    # Find within assembly folders
                                    for annotations_database in annotations_databases:
                                        found_files = find_all(
                                            annotation_file,
                                            os.path.join(
                                                annotations_database, assembly
                                            ),
                                        )
                                        if len(found_files) > 0:
                                            annotation_file_found = found_files[0]
                                            break
                                    if not annotation_file_found and not assembly:
                                        # Find within folders
                                        for (
                                            annotations_database
                                        ) in annotations_databases:
                                            found_files = find_all(
                                                annotation_file, annotations_database
                                            )
                                            if len(found_files) > 0:
                                                annotation_file_found = found_files[0]
                                                break
                                    log.debug(
                                        f"for {annotation_file} annotation_file_found={annotation_file_found}"
                                    )

                                # Full path
                                annotation_file_found = full_path(annotation_file_found)

                                if annotation_file_found:

                                    database = Database(database=annotation_file_found)
                                    quick_annotation_format = database.get_format()
                                    quick_annotation_is_compressed = (
                                        database.is_compressed()
                                    )
                                    quick_annotation_is_indexed = os.path.exists(
                                        f"{annotation_file_found}.tbi"
                                    )
                                    # bcftools is currently never preferred
                                    bcftools_preference = False

                                    # Check Annotation Tool
                                    if not annotation_tool:
                                        if (
                                            bcftools_preference
                                            and quick_annotation_format
                                            in ["vcf", "bed"]
                                            and quick_annotation_is_compressed
                                            and quick_annotation_is_indexed
                                        ):
                                            annotation_tool = "bcftools"
                                        elif quick_annotation_format in [
                                            "vcf",
                                            "bed",
                                            "tsv",
                                            "tsv",
                                            "csv",
                                            "json",
                                            "tbl",
                                            "parquet",
                                            "duckdb",
                                        ]:
                                            annotation_tool = "parquet"
                                        else:
                                            log.error(
                                                f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet"
                                            )
                                            raise ValueError(
                                                f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet"
                                            )

                                    log.debug(
                                        f"Quick Annotation File {annotation_file} - Annotation tool: {annotation_tool}"
                                    )

                                    # Annotation Tool dispatch
                                    # Register the found database under the
                                    # chosen tool's "annotations" mapping
                                    if annotation_tool:
                                        if annotation_tool not in param["annotation"]:
                                            param["annotation"][annotation_tool] = {}
                                        if (
                                            "annotations"
                                            not in param["annotation"][annotation_tool]
                                        ):
                                            param["annotation"][annotation_tool][
                                                "annotations"
                                            ] = {}
                                        param["annotation"][annotation_tool][
                                            "annotations"
                                        ][annotation_file_found] = annotations

                                else:
                                    log.error(
                                        f"Quick Annotation File {annotation_file} does NOT exist"
                                    )

        self.set_param(param)

        # Run each configured annotation backend in turn
        if param.get("annotation", None):
            log.info("Annotations")
            if param.get("annotation", {}).get("parquet", None):
                log.info("Annotations 'parquet'...")
                self.annotation_parquet()
            if param.get("annotation", {}).get("bcftools", None):
                log.info("Annotations 'bcftools'...")
                self.annotation_bcftools()
            if param.get("annotation", {}).get("snpsift", None):
                log.info("Annotations 'snpsift'...")
                self.annotation_snpsift()
            if param.get("annotation", {}).get("annovar", None):
                log.info("Annotations 'annovar'...")
                self.annotation_annovar()
            if param.get("annotation", {}).get("snpeff", None):
                log.info("Annotations 'snpeff'...")
                self.annotation_snpeff()
            if param.get("annotation", {}).get("exomiser",
None) is not None: 3207 log.info("Annotations 'exomiser'...") 3208 self.annotation_exomiser() 3209 if param.get("annotation", {}).get("splice", None) is not None: 3210 log.info("Annotations 'splice' ...") 3211 self.annotation_splice() 3212 3213 # Explode INFOS fields into table fields 3214 if self.get_explode_infos(): 3215 self.explode_infos( 3216 prefix=self.get_explode_infos_prefix(), 3217 fields=self.get_explode_infos_fields(), 3218 force=True, 3219 ) 3220 3221 def annotation_snpsift(self, threads: int = None) -> None: 3222 """ 3223 This function annotate with bcftools 3224 3225 :param threads: Number of threads to use 3226 :return: the value of the variable "return_value". 3227 """ 3228 3229 # DEBUG 3230 log.debug("Start annotation with bcftools databases") 3231 3232 # Threads 3233 if not threads: 3234 threads = self.get_threads() 3235 log.debug("Threads: " + str(threads)) 3236 3237 # Config 3238 config = self.get_config() 3239 log.debug("Config: " + str(config)) 3240 3241 # Config - snpSift 3242 snpsift_bin_command = get_bin_command( 3243 bin="SnpSift.jar", 3244 tool="snpsift", 3245 bin_type="jar", 3246 config=config, 3247 default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff", 3248 ) 3249 if not snpsift_bin_command: 3250 msg_err = f"Annotation failed: no snpsift bin '{snpsift_bin_command}'" 3251 log.error(msg_err) 3252 raise ValueError(msg_err) 3253 3254 # Config - bcftools 3255 bcftools_bin_command = get_bin_command( 3256 bin="bcftools", 3257 tool="bcftools", 3258 bin_type="bin", 3259 config=config, 3260 default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools", 3261 ) 3262 if not bcftools_bin_command: 3263 msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'" 3264 log.error(msg_err) 3265 raise ValueError(msg_err) 3266 3267 # Config - BCFTools databases folders 3268 databases_folders = set( 3269 self.get_config() 3270 .get("folders", {}) 3271 .get("databases", {}) 3272 .get("annotations", ["."]) 3273 + self.get_config() 3274 .get("folders", {}) 3275 
.get("databases", {}) 3276 .get("bcftools", ["."]) 3277 ) 3278 log.debug("Databases annotations: " + str(databases_folders)) 3279 3280 # Param 3281 annotations = ( 3282 self.get_param() 3283 .get("annotation", {}) 3284 .get("snpsift", {}) 3285 .get("annotations", None) 3286 ) 3287 log.debug("Annotations: " + str(annotations)) 3288 3289 # Assembly 3290 assembly = self.get_param().get( 3291 "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY) 3292 ) 3293 3294 # Data 3295 table_variants = self.get_table_variants() 3296 3297 # Check if not empty 3298 log.debug("Check if not empty") 3299 sql_query_chromosomes = ( 3300 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 3301 ) 3302 sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes) 3303 if not sql_query_chromosomes_df["count"][0]: 3304 log.info(f"VCF empty") 3305 return 3306 3307 # VCF header 3308 vcf_reader = self.get_header() 3309 log.debug("Initial header: " + str(vcf_reader.infos)) 3310 3311 # Existing annotations 3312 for vcf_annotation in self.get_header().infos: 3313 3314 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 3315 log.debug( 3316 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 3317 ) 3318 3319 if annotations: 3320 3321 with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir: 3322 3323 # Export VCF file 3324 tmp_vcf_name = os.path.join(tmp_dir, "input.vcf.gz") 3325 3326 # Init 3327 commands = {} 3328 3329 for annotation in annotations: 3330 annotation_fields = annotations[annotation] 3331 3332 # Annotation Name 3333 annotation_name = os.path.basename(annotation) 3334 3335 if not annotation_fields: 3336 annotation_fields = {"INFO": None} 3337 3338 log.debug(f"Annotation '{annotation_name}'") 3339 log.debug( 3340 f"Annotation '{annotation_name}' - fields: {annotation_fields}" 3341 ) 3342 3343 # Create Database 3344 database = Database( 3345 database=annotation, 3346 databases_folders=databases_folders, 3347 
assembly=assembly, 3348 ) 3349 3350 # Find files 3351 db_file = database.get_database() 3352 db_file = full_path(db_file) 3353 db_hdr_file = database.get_header_file() 3354 db_hdr_file = full_path(db_hdr_file) 3355 db_file_type = database.get_format() 3356 db_tbi_file = f"{db_file}.tbi" 3357 db_file_compressed = database.is_compressed() 3358 3359 # Check if compressed 3360 if not db_file_compressed: 3361 log.error( 3362 f"Annotation '{annotation}' - {db_file} NOT compressed file" 3363 ) 3364 raise ValueError( 3365 f"Annotation '{annotation}' - {db_file} NOT compressed file" 3366 ) 3367 3368 # Check if indexed 3369 if not os.path.exists(db_tbi_file): 3370 log.error( 3371 f"Annotation '{annotation}' - {db_file} NOT indexed file" 3372 ) 3373 raise ValueError( 3374 f"Annotation '{annotation}' - {db_file} NOT indexed file" 3375 ) 3376 3377 # Check index - try to create if not exists 3378 if not os.path.exists(db_file) or not os.path.exists(db_hdr_file): 3379 log.error("Annotation failed: database not valid") 3380 log.error(f"Annotation annotation file: {db_file}") 3381 log.error(f"Annotation annotation header: {db_hdr_file}") 3382 log.error(f"Annotation annotation index: {db_tbi_file}") 3383 raise ValueError( 3384 f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}" 3385 ) 3386 else: 3387 3388 log.debug( 3389 f"Annotation '{annotation}' - file: " 3390 + str(db_file) 3391 + " and " 3392 + str(db_hdr_file) 3393 ) 3394 3395 # Load header as VCF object 3396 db_hdr_vcf = Variants(input=db_hdr_file) 3397 db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos 3398 log.debug( 3399 "Annotation database header: " 3400 + str(db_hdr_vcf_header_infos) 3401 ) 3402 3403 # For all fields in database 3404 annotation_fields_full = False 3405 if "ALL" in annotation_fields or "INFO" in annotation_fields: 3406 annotation_fields = { 3407 key: key for key in 
db_hdr_vcf_header_infos 3408 } 3409 log.debug( 3410 "Annotation database header - All annotations added: " 3411 + str(annotation_fields) 3412 ) 3413 annotation_fields_full = True 3414 3415 # # Create file for field rename 3416 # log.debug("Create file for field rename") 3417 # tmp_rename = NamedTemporaryFile( 3418 # prefix=self.get_prefix(), 3419 # dir=self.get_tmp_dir(), 3420 # suffix=".rename", 3421 # delete=False, 3422 # ) 3423 # tmp_rename_name = tmp_rename.name 3424 # tmp_files.append(tmp_rename_name) 3425 3426 # Number of fields 3427 nb_annotation_field = 0 3428 annotation_list = [] 3429 annotation_infos_rename_list = [] 3430 3431 for annotation_field in annotation_fields: 3432 3433 # field new name, if parametered SKIPPED !!!!!! not managed actually TODO 3434 annotation_fields_new_name = annotation_fields.get( 3435 annotation_field, annotation_field 3436 ) 3437 if not annotation_fields_new_name: 3438 annotation_fields_new_name = annotation_field 3439 3440 # Check if field is in DB and if field is not elready in input data 3441 if ( 3442 annotation_field in db_hdr_vcf.get_header().infos 3443 and annotation_fields_new_name 3444 not in self.get_header().infos 3445 ): 3446 3447 log.info( 3448 f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'" 3449 ) 3450 3451 # BCFTools annotate param to rename fields 3452 if annotation_field != annotation_fields_new_name: 3453 annotation_infos_rename_list.append( 3454 f"{annotation_fields_new_name}:=INFO/{annotation_field}" 3455 ) 3456 3457 # Add INFO field to header 3458 db_hdr_vcf_header_infos_number = ( 3459 db_hdr_vcf_header_infos[annotation_field].num or "." 
3460 ) 3461 db_hdr_vcf_header_infos_type = ( 3462 db_hdr_vcf_header_infos[annotation_field].type 3463 or "String" 3464 ) 3465 db_hdr_vcf_header_infos_description = ( 3466 db_hdr_vcf_header_infos[annotation_field].desc 3467 or f"{annotation_field} description" 3468 ) 3469 db_hdr_vcf_header_infos_source = ( 3470 db_hdr_vcf_header_infos[annotation_field].source 3471 or "unknown" 3472 ) 3473 db_hdr_vcf_header_infos_version = ( 3474 db_hdr_vcf_header_infos[annotation_field].version 3475 or "unknown" 3476 ) 3477 3478 vcf_reader.infos[annotation_fields_new_name] = ( 3479 vcf.parser._Info( 3480 annotation_fields_new_name, 3481 db_hdr_vcf_header_infos_number, 3482 db_hdr_vcf_header_infos_type, 3483 db_hdr_vcf_header_infos_description, 3484 db_hdr_vcf_header_infos_source, 3485 db_hdr_vcf_header_infos_version, 3486 self.code_type_map[ 3487 db_hdr_vcf_header_infos_type 3488 ], 3489 ) 3490 ) 3491 3492 annotation_list.append(annotation_field) 3493 3494 nb_annotation_field += 1 3495 3496 else: 3497 3498 if ( 3499 annotation_field 3500 not in db_hdr_vcf.get_header().infos 3501 ): 3502 log.warning( 3503 f"Annotation '{annotation_name}' - '{annotation_field}' - not available in vcf/bed file" 3504 ) 3505 if ( 3506 annotation_fields_new_name 3507 in self.get_header().infos 3508 ): 3509 log.warning( 3510 f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' - already exists (skipped)" 3511 ) 3512 3513 log.info( 3514 f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file" 3515 ) 3516 3517 annotation_infos = ",".join(annotation_list) 3518 3519 if annotation_infos != "": 3520 3521 # Annotated VCF (and error file) 3522 tmp_annotation_vcf_name = os.path.join( 3523 tmp_dir, os.path.basename(annotation) + ".vcf.gz" 3524 ) 3525 tmp_annotation_vcf_name_err = ( 3526 tmp_annotation_vcf_name + ".err" 3527 ) 3528 3529 # Add fields to annotate 3530 if not annotation_fields_full: 3531 annotation_infos_option = f"-info {annotation_infos}" 3532 else: 
3533 annotation_infos_option = "" 3534 3535 # Info fields rename 3536 if annotation_infos_rename_list: 3537 annotation_infos_rename = " -c " + ",".join( 3538 annotation_infos_rename_list 3539 ) 3540 else: 3541 annotation_infos_rename = "" 3542 3543 # Annotate command 3544 command_annotate = f"{snpsift_bin_command} annotate {annotation_infos_option} {db_file} {tmp_vcf_name} | {bcftools_bin_command} annotate --threads={threads} {annotation_infos_rename} -Oz1 -o {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} " 3545 3546 # Add command 3547 commands[command_annotate] = tmp_annotation_vcf_name 3548 3549 if commands: 3550 3551 # Export VCF file 3552 self.export_variant_vcf( 3553 vcf_file=tmp_vcf_name, 3554 remove_info=True, 3555 add_samples=False, 3556 index=True, 3557 ) 3558 shutil.copyfile(tmp_vcf_name, "/tmp/input.vcf") 3559 3560 # Num command 3561 nb_command = 0 3562 3563 # Annotate 3564 for command_annotate in commands: 3565 nb_command += 1 3566 log.info( 3567 f"Annotation - Annotate [{nb_command}/{len(commands)}]..." 3568 ) 3569 log.debug(f"command_annotate={command_annotate}") 3570 run_parallel_commands([command_annotate], threads) 3571 3572 # Debug 3573 shutil.copyfile(commands[command_annotate], "/tmp/snpsift.vcf") 3574 3575 # Update variants 3576 log.info( 3577 f"Annotation - Updating [{nb_command}/{len(commands)}]..." 3578 ) 3579 self.update_from_vcf(commands[command_annotate]) 3580 3581 def annotation_bcftools(self, threads: int = None) -> None: 3582 """ 3583 This function annotate with bcftools 3584 3585 :param threads: Number of threads to use 3586 :return: the value of the variable "return_value". 
3587 """ 3588 3589 # DEBUG 3590 log.debug("Start annotation with bcftools databases") 3591 3592 # Threads 3593 if not threads: 3594 threads = self.get_threads() 3595 log.debug("Threads: " + str(threads)) 3596 3597 # Config 3598 config = self.get_config() 3599 log.debug("Config: " + str(config)) 3600 3601 # DEBUG 3602 delete_tmp = True 3603 if self.get_config().get("verbosity", "warning") in ["debug"]: 3604 delete_tmp = False 3605 log.debug("Delete tmp files/folders: " + str(delete_tmp)) 3606 3607 # Config - BCFTools bin command 3608 bcftools_bin_command = get_bin_command( 3609 bin="bcftools", 3610 tool="bcftools", 3611 bin_type="bin", 3612 config=config, 3613 default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools", 3614 ) 3615 if not bcftools_bin_command: 3616 msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'" 3617 log.error(msg_err) 3618 raise ValueError(msg_err) 3619 3620 # Config - BCFTools databases folders 3621 databases_folders = set( 3622 self.get_config() 3623 .get("folders", {}) 3624 .get("databases", {}) 3625 .get("annotations", ["."]) 3626 + self.get_config() 3627 .get("folders", {}) 3628 .get("databases", {}) 3629 .get("bcftools", ["."]) 3630 ) 3631 log.debug("Databases annotations: " + str(databases_folders)) 3632 3633 # Param 3634 annotations = ( 3635 self.get_param() 3636 .get("annotation", {}) 3637 .get("bcftools", {}) 3638 .get("annotations", None) 3639 ) 3640 log.debug("Annotations: " + str(annotations)) 3641 3642 # Assembly 3643 assembly = self.get_param().get( 3644 "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY) 3645 ) 3646 3647 # Data 3648 table_variants = self.get_table_variants() 3649 3650 # Check if not empty 3651 log.debug("Check if not empty") 3652 sql_query_chromosomes = ( 3653 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 3654 ) 3655 sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes) 3656 if not sql_query_chromosomes_df["count"][0]: 3657 log.info(f"VCF empty") 
3658 return 3659 3660 # Export in VCF 3661 log.debug("Create initial file to annotate") 3662 tmp_vcf = NamedTemporaryFile( 3663 prefix=self.get_prefix(), 3664 dir=self.get_tmp_dir(), 3665 suffix=".vcf.gz", 3666 delete=False, 3667 ) 3668 tmp_vcf_name = tmp_vcf.name 3669 3670 # VCF header 3671 vcf_reader = self.get_header() 3672 log.debug("Initial header: " + str(vcf_reader.infos)) 3673 3674 # Existing annotations 3675 for vcf_annotation in self.get_header().infos: 3676 3677 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 3678 log.debug( 3679 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 3680 ) 3681 3682 if annotations: 3683 3684 tmp_ann_vcf_list = [] 3685 commands = [] 3686 tmp_files = [] 3687 err_files = [] 3688 3689 for annotation in annotations: 3690 annotation_fields = annotations[annotation] 3691 3692 # Annotation Name 3693 annotation_name = os.path.basename(annotation) 3694 3695 if not annotation_fields: 3696 annotation_fields = {"INFO": None} 3697 3698 log.debug(f"Annotation '{annotation_name}'") 3699 log.debug( 3700 f"Annotation '{annotation_name}' - fields: {annotation_fields}" 3701 ) 3702 3703 # Create Database 3704 database = Database( 3705 database=annotation, 3706 databases_folders=databases_folders, 3707 assembly=assembly, 3708 ) 3709 3710 # Find files 3711 db_file = database.get_database() 3712 db_file = full_path(db_file) 3713 db_hdr_file = database.get_header_file() 3714 db_hdr_file = full_path(db_hdr_file) 3715 db_file_type = database.get_format() 3716 db_tbi_file = f"{db_file}.tbi" 3717 db_file_compressed = database.is_compressed() 3718 3719 # Check if compressed 3720 if not db_file_compressed: 3721 log.error( 3722 f"Annotation '{annotation}' - {db_file} NOT compressed file" 3723 ) 3724 raise ValueError( 3725 f"Annotation '{annotation}' - {db_file} NOT compressed file" 3726 ) 3727 3728 # Check if indexed 3729 if not os.path.exists(db_tbi_file): 3730 log.error(f"Annotation '{annotation}' - {db_file} NOT 
indexed file") 3731 raise ValueError( 3732 f"Annotation '{annotation}' - {db_file} NOT indexed file" 3733 ) 3734 3735 # Check index - try to create if not exists 3736 if not os.path.exists(db_file) or not os.path.exists(db_hdr_file): 3737 log.error("Annotation failed: database not valid") 3738 log.error(f"Annotation annotation file: {db_file}") 3739 log.error(f"Annotation annotation header: {db_hdr_file}") 3740 log.error(f"Annotation annotation index: {db_tbi_file}") 3741 raise ValueError( 3742 f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}" 3743 ) 3744 else: 3745 3746 log.debug( 3747 f"Annotation '{annotation}' - file: " 3748 + str(db_file) 3749 + " and " 3750 + str(db_hdr_file) 3751 ) 3752 3753 # Load header as VCF object 3754 db_hdr_vcf = Variants(input=db_hdr_file) 3755 db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos 3756 log.debug( 3757 "Annotation database header: " + str(db_hdr_vcf_header_infos) 3758 ) 3759 3760 # For all fields in database 3761 if "ALL" in annotation_fields or "INFO" in annotation_fields: 3762 annotation_fields = { 3763 key: key for key in db_hdr_vcf_header_infos 3764 } 3765 log.debug( 3766 "Annotation database header - All annotations added: " 3767 + str(annotation_fields) 3768 ) 3769 3770 # Number of fields 3771 nb_annotation_field = 0 3772 annotation_list = [] 3773 3774 for annotation_field in annotation_fields: 3775 3776 # field new name, if parametered SKIPPED !!!!!! 
not managed actually TODO 3777 annotation_fields_new_name = annotation_fields.get( 3778 annotation_field, annotation_field 3779 ) 3780 if not annotation_fields_new_name: 3781 annotation_fields_new_name = annotation_field 3782 3783 # Check if field is in DB and if field is not elready in input data 3784 if ( 3785 annotation_field in db_hdr_vcf.get_header().infos 3786 and annotation_fields_new_name 3787 not in self.get_header().infos 3788 ): 3789 3790 log.info( 3791 f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'" 3792 ) 3793 3794 # Add INFO field to header 3795 db_hdr_vcf_header_infos_number = ( 3796 db_hdr_vcf_header_infos[annotation_field].num or "." 3797 ) 3798 db_hdr_vcf_header_infos_type = ( 3799 db_hdr_vcf_header_infos[annotation_field].type 3800 or "String" 3801 ) 3802 db_hdr_vcf_header_infos_description = ( 3803 db_hdr_vcf_header_infos[annotation_field].desc 3804 or f"{annotation_field} description" 3805 ) 3806 db_hdr_vcf_header_infos_source = ( 3807 db_hdr_vcf_header_infos[annotation_field].source 3808 or "unknown" 3809 ) 3810 db_hdr_vcf_header_infos_version = ( 3811 db_hdr_vcf_header_infos[annotation_field].version 3812 or "unknown" 3813 ) 3814 3815 vcf_reader.infos[annotation_fields_new_name] = ( 3816 vcf.parser._Info( 3817 annotation_fields_new_name, 3818 db_hdr_vcf_header_infos_number, 3819 db_hdr_vcf_header_infos_type, 3820 db_hdr_vcf_header_infos_description, 3821 db_hdr_vcf_header_infos_source, 3822 db_hdr_vcf_header_infos_version, 3823 self.code_type_map[db_hdr_vcf_header_infos_type], 3824 ) 3825 ) 3826 3827 # annotation_list.append(annotation_field) 3828 if annotation_field != annotation_fields_new_name: 3829 annotation_list.append( 3830 f"{annotation_fields_new_name}:=INFO/{annotation_field}" 3831 ) 3832 else: 3833 annotation_list.append(annotation_field) 3834 3835 nb_annotation_field += 1 3836 3837 else: 3838 3839 if annotation_field not in db_hdr_vcf.get_header().infos: 3840 log.warning( 3841 
f"Annotation '{annotation}' - '{annotation_field}' - not available in vcf/bed file" 3842 ) 3843 if annotation_fields_new_name in self.get_header().infos: 3844 log.warning( 3845 f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)" 3846 ) 3847 3848 log.info( 3849 f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file" 3850 ) 3851 3852 annotation_infos = ",".join(annotation_list) 3853 3854 if annotation_infos != "": 3855 3856 # Protect header for bcftools (remove "#CHROM" and variants line) 3857 log.debug("Protect Header file - remove #CHROM line if exists") 3858 tmp_header_vcf = NamedTemporaryFile( 3859 prefix=self.get_prefix(), 3860 dir=self.get_tmp_dir(), 3861 suffix=".hdr", 3862 delete=False, 3863 ) 3864 tmp_header_vcf_name = tmp_header_vcf.name 3865 tmp_files.append(tmp_header_vcf_name) 3866 # Command 3867 if db_hdr_file.endswith(".gz"): 3868 command_extract_header = f"zcat {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}" 3869 else: 3870 command_extract_header = f"cat {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}" 3871 # Run 3872 run_parallel_commands([command_extract_header], 1) 3873 3874 # Find chomosomes 3875 log.debug("Find chromosomes ") 3876 sql_query_chromosomes = f"""SELECT table_variants.\"#CHROM\" as CHROM FROM {table_variants} as table_variants GROUP BY table_variants.\"#CHROM\"""" 3877 sql_query_chromosomes_df = self.get_query_to_df( 3878 sql_query_chromosomes 3879 ) 3880 chomosomes_list = list(sql_query_chromosomes_df["CHROM"]) 3881 3882 log.debug("Chromosomes found: " + str(list(chomosomes_list))) 3883 3884 # BED columns in the annotation file 3885 if db_file_type in ["bed"]: 3886 annotation_infos = "CHROM,POS,POS," + annotation_infos 3887 3888 for chrom in chomosomes_list: 3889 3890 # Create BED on initial VCF 3891 log.debug("Create BED on initial VCF: " + str(tmp_vcf_name)) 3892 tmp_bed = NamedTemporaryFile( 3893 prefix=self.get_prefix(), 3894 
dir=self.get_tmp_dir(), 3895 suffix=".bed", 3896 delete=False, 3897 ) 3898 tmp_bed_name = tmp_bed.name 3899 tmp_files.append(tmp_bed_name) 3900 3901 # Detecte regions 3902 log.debug( 3903 f"Annotation '{annotation}' - Chromosome '{chrom}' - Start detecting regions..." 3904 ) 3905 window = 1000000 3906 sql_query_intervals_for_bed = f""" 3907 SELECT \"#CHROM\", 3908 CASE WHEN \"POS\"-{window}-1 < 0 THEN 0 ELSE \"POS\"-{window}-1 END, 3909 \"POS\"+{window} 3910 FROM {table_variants} as table_variants 3911 WHERE table_variants.\"#CHROM\" = '{chrom}' 3912 """ 3913 regions = self.conn.execute( 3914 sql_query_intervals_for_bed 3915 ).fetchall() 3916 merged_regions = merge_regions(regions) 3917 log.debug( 3918 f"Annotation '{annotation}' - Chromosome '{chrom}' - Stop detecting regions..." 3919 ) 3920 3921 header = ["#CHROM", "START", "END"] 3922 with open(tmp_bed_name, "w") as f: 3923 # Write the header with tab delimiter 3924 f.write("\t".join(header) + "\n") 3925 for d in merged_regions: 3926 # Write each data row with tab delimiter 3927 f.write("\t".join(map(str, d)) + "\n") 3928 3929 # Tmp files 3930 tmp_annotation_vcf = NamedTemporaryFile( 3931 prefix=self.get_prefix(), 3932 dir=self.get_tmp_dir(), 3933 suffix=".vcf.gz", 3934 delete=False, 3935 ) 3936 tmp_annotation_vcf_name = tmp_annotation_vcf.name 3937 tmp_files.append(tmp_annotation_vcf_name) 3938 tmp_ann_vcf_list.append(f"{tmp_annotation_vcf_name}") 3939 tmp_annotation_vcf_name_err = ( 3940 tmp_annotation_vcf_name + ".err" 3941 ) 3942 err_files.append(tmp_annotation_vcf_name_err) 3943 3944 # Annotate Command 3945 log.debug( 3946 f"Annotation '{annotation}' - add bcftools command" 3947 ) 3948 3949 # Command 3950 command_annotate = f"{bcftools_bin_command} annotate --pair-logic exact --regions-file={tmp_bed_name} -a {db_file} -h {tmp_header_vcf_name} -c {annotation_infos} {tmp_vcf_name} -o {tmp_annotation_vcf_name} -Oz1 2>>{tmp_annotation_vcf_name_err} && tabix {tmp_annotation_vcf_name} 
2>>{tmp_annotation_vcf_name_err} " 3951 3952 # Add command 3953 commands.append(command_annotate) 3954 3955 # if some commands 3956 if commands: 3957 3958 # Export VCF file 3959 self.export_variant_vcf( 3960 vcf_file=tmp_vcf_name, 3961 remove_info=True, 3962 add_samples=False, 3963 index=True, 3964 ) 3965 3966 # Threads 3967 # calculate threads for annotated commands 3968 if commands: 3969 threads_bcftools_annotate = round(threads / len(commands)) 3970 else: 3971 threads_bcftools_annotate = 1 3972 3973 if not threads_bcftools_annotate: 3974 threads_bcftools_annotate = 1 3975 3976 # Add threads option to bcftools commands 3977 if threads_bcftools_annotate > 1: 3978 commands_threaded = [] 3979 for command in commands: 3980 commands_threaded.append( 3981 command.replace( 3982 f"{bcftools_bin_command} annotate ", 3983 f"{bcftools_bin_command} annotate --threads={threads_bcftools_annotate} ", 3984 ) 3985 ) 3986 commands = commands_threaded 3987 3988 # Command annotation multithreading 3989 log.debug(f"Annotation - Annotation commands: " + str(commands)) 3990 log.info( 3991 f"Annotation - Annotation multithreaded in " 3992 + str(len(commands)) 3993 + " commands" 3994 ) 3995 3996 run_parallel_commands(commands, threads) 3997 3998 # Merge 3999 tmp_ann_vcf_list_cmd = " ".join(tmp_ann_vcf_list) 4000 4001 if tmp_ann_vcf_list_cmd: 4002 4003 # Tmp file 4004 tmp_annotate_vcf = NamedTemporaryFile( 4005 prefix=self.get_prefix(), 4006 dir=self.get_tmp_dir(), 4007 suffix=".vcf.gz", 4008 delete=True, 4009 ) 4010 tmp_annotate_vcf_name = tmp_annotate_vcf.name 4011 tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err" 4012 err_files.append(tmp_annotate_vcf_name_err) 4013 4014 # Tmp file remove command 4015 tmp_files_remove_command = "" 4016 if tmp_files: 4017 tmp_files_remove_command = " && rm -f " + " ".join(tmp_files) 4018 4019 # Command merge 4020 merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_ann_vcf_list_cmd} -o 
{tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} {tmp_files_remove_command}" 4021 log.info( 4022 f"Annotation - Annotation merging " 4023 + str(len(commands)) 4024 + " annotated files" 4025 ) 4026 log.debug(f"Annotation - merge command: {merge_command}") 4027 run_parallel_commands([merge_command], 1) 4028 4029 # Error messages 4030 log.info(f"Error/Warning messages:") 4031 error_message_command_all = [] 4032 error_message_command_warning = [] 4033 error_message_command_err = [] 4034 for err_file in err_files: 4035 with open(err_file, "r") as f: 4036 for line in f: 4037 message = line.strip() 4038 error_message_command_all.append(message) 4039 if line.startswith("[W::"): 4040 error_message_command_warning.append(message) 4041 if line.startswith("[E::"): 4042 error_message_command_err.append( 4043 f"{err_file}: " + message 4044 ) 4045 # log info 4046 for message in list( 4047 set(error_message_command_err + error_message_command_warning) 4048 ): 4049 log.info(f" {message}") 4050 # debug info 4051 for message in list(set(error_message_command_all)): 4052 log.debug(f" {message}") 4053 # failed 4054 if len(error_message_command_err): 4055 log.error("Annotation failed: Error in commands") 4056 raise ValueError("Annotation failed: Error in commands") 4057 4058 # Update variants 4059 log.info(f"Annotation - Updating...") 4060 self.update_from_vcf(tmp_annotate_vcf_name) 4061 4062 def annotation_exomiser(self, threads: int = None) -> None: 4063 """ 4064 This function annotate with Exomiser 4065 4066 This function uses args as parameters, in section "annotation" -> "exomiser", with sections: 4067 - "analysis" (dict/file): 4068 Full analysis dictionnary parameters (see Exomiser docs). 4069 Either a dict, or a file in JSON or YAML format. 4070 These parameters may change depending on other parameters (e.g. phenotipicFeatures/HPO) 4071 Default : None 4072 - "preset" (string): 4073 Analysis preset (available in config folder). 
4074 Used if no full "analysis" is provided. 4075 Default: "exome" 4076 - "phenopacket" (dict/file): 4077 Samples and phenotipic features parameters (see Exomiser docs). 4078 Either a dict, or a file in JSON or YAML format. 4079 Default: None 4080 - "subject" (dict): 4081 Sample parameters (see Exomiser docs). 4082 Example: 4083 "subject": 4084 { 4085 "id": "ISDBM322017", 4086 "sex": "FEMALE" 4087 } 4088 Default: None 4089 - "sample" (string): 4090 Sample name to construct "subject" section: 4091 "subject": 4092 { 4093 "id": "<sample>", 4094 "sex": "UNKNOWN_SEX" 4095 } 4096 Default: None 4097 - "phenotypicFeatures" (dict) 4098 Phenotypic features to construct "subject" section. 4099 Example: 4100 "phenotypicFeatures": 4101 [ 4102 { "type": { "id": "HP:0001159", "label": "Syndactyly" } }, 4103 { "type": { "id": "HP:0000486", "label": "Strabismus" } } 4104 ] 4105 - "hpo" (list) 4106 List of HPO ids as phenotypic features. 4107 Example: 4108 "hpo": ['0001156', '0001363', '0011304', '0010055'] 4109 Default: [] 4110 - "outputOptions" (dict): 4111 Output options (see Exomiser docs). 4112 Default: 4113 "output_options" = 4114 { 4115 "outputContributingVariantsOnly": False, 4116 "numGenes": 0, 4117 "outputFormats": ["TSV_VARIANT", "VCF"] 4118 } 4119 - "transcript_source" (string): 4120 Transcript source (either "refseq", "ucsc", "ensembl") 4121 Default: "refseq" 4122 - "exomiser_to_info" (boolean): 4123 Add exomiser TSV file columns as INFO fields in VCF. 4124 Default: False 4125 - "release" (string): 4126 Exomise database release. 4127 If not exists, database release will be downloaded (take a while). 4128 Default: None (provided by application.properties configuration file) 4129 - "exomiser_application_properties" (file): 4130 Exomiser configuration file (see Exomiser docs). 4131 Useful to automatically download databases (especially for specific genome databases). 
4132 4133 Notes: 4134 - If no sample in parameters, first sample in VCF will be chosen 4135 - If no HPO found, "hiPhivePrioritiser" analysis step will be switch off 4136 4137 :param threads: The number of threads to use 4138 :return: None. 4139 """ 4140 4141 # DEBUG 4142 log.debug("Start annotation with Exomiser databases") 4143 4144 # Threads 4145 if not threads: 4146 threads = self.get_threads() 4147 log.debug("Threads: " + str(threads)) 4148 4149 # Config 4150 config = self.get_config() 4151 log.debug("Config: " + str(config)) 4152 4153 # Config - Folders - Databases 4154 databases_folders = ( 4155 config.get("folders", {}) 4156 .get("databases", {}) 4157 .get("exomiser", f"{DEFAULT_DATABASE_FOLDER}/exomiser/current") 4158 ) 4159 databases_folders = full_path(databases_folders) 4160 if not os.path.exists(databases_folders): 4161 log.error(f"Databases annotations: {databases_folders} NOT found") 4162 log.debug("Databases annotations: " + str(databases_folders)) 4163 4164 # Config - Exomiser 4165 exomiser_bin_command = get_bin_command( 4166 bin="exomiser-cli*.jar", 4167 tool="exomiser", 4168 bin_type="jar", 4169 config=config, 4170 default_folder=f"{DEFAULT_TOOLS_FOLDER}/exomiser", 4171 ) 4172 log.debug("Exomiser bin command: " + str(exomiser_bin_command)) 4173 if not exomiser_bin_command: 4174 msg_err = f"Annotation failed: no exomiser bin '{exomiser_bin_command}'" 4175 log.error(msg_err) 4176 raise ValueError(msg_err) 4177 4178 # Param 4179 param = self.get_param() 4180 log.debug("Param: " + str(param)) 4181 4182 # Param - Exomiser 4183 param_exomiser = param.get("annotation", {}).get("exomiser", {}) 4184 log.debug(f"Param Exomiser: {param_exomiser}") 4185 4186 # Param - Assembly 4187 assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY)) 4188 log.debug("Assembly: " + str(assembly)) 4189 4190 # Data 4191 table_variants = self.get_table_variants() 4192 4193 # Check if not empty 4194 log.debug("Check if not empty") 4195 sql_query_chromosomes = 
( 4196 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 4197 ) 4198 if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]: 4199 log.info(f"VCF empty") 4200 return False 4201 4202 # VCF header 4203 vcf_reader = self.get_header() 4204 log.debug("Initial header: " + str(vcf_reader.infos)) 4205 4206 # Samples 4207 samples = self.get_header_sample_list() 4208 if not samples: 4209 log.error("No Samples in VCF") 4210 return False 4211 log.debug(f"Samples: {samples}") 4212 4213 # Memory limit 4214 memory_limit = self.get_memory("8G") 4215 log.debug(f"memory_limit: {memory_limit}") 4216 4217 # Exomiser java options 4218 exomiser_java_options = ( 4219 f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} " 4220 ) 4221 log.debug(f"Exomiser java options: {exomiser_java_options}") 4222 4223 # Download Exomiser (if not exists) 4224 exomiser_release = param_exomiser.get("release", None) 4225 exomiser_application_properties = param_exomiser.get( 4226 "exomiser_application_properties", None 4227 ) 4228 databases_download_exomiser( 4229 assemblies=[assembly], 4230 exomiser_folder=databases_folders, 4231 exomiser_release=exomiser_release, 4232 exomiser_phenotype_release=exomiser_release, 4233 exomiser_application_properties=exomiser_application_properties, 4234 ) 4235 4236 # Force annotation 4237 force_update_annotation = True 4238 4239 if "Exomiser" not in self.get_header().infos or force_update_annotation: 4240 log.debug("Start annotation Exomiser") 4241 4242 with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir: 4243 4244 # tmp_dir = "/tmp/exomiser" 4245 4246 ### ANALYSIS ### 4247 ################ 4248 4249 # Create analysis.json through analysis dict 4250 # either analysis in param or by default 4251 # depending on preset exome/genome) 4252 4253 # Init analysis dict 4254 param_exomiser_analysis_dict = {} 4255 4256 # analysis from param 4257 param_exomiser_analysis = param_exomiser.get("analysis", {}) 4258 
param_exomiser_analysis = full_path(param_exomiser_analysis) 4259 4260 # If analysis in param -> load anlaysis json 4261 if param_exomiser_analysis: 4262 4263 # If param analysis is a file and exists 4264 if isinstance(param_exomiser_analysis, str) and os.path.exists( 4265 param_exomiser_analysis 4266 ): 4267 # Load analysis file into analysis dict (either yaml or json) 4268 with open(param_exomiser_analysis) as json_file: 4269 param_exomiser_analysis_dict = yaml.safe_load(json_file) 4270 4271 # If param analysis is a dict 4272 elif isinstance(param_exomiser_analysis, dict): 4273 # Load analysis dict into analysis dict (either yaml or json) 4274 param_exomiser_analysis_dict = param_exomiser_analysis 4275 4276 # Error analysis type 4277 else: 4278 log.error(f"Analysis type unknown. Check param file.") 4279 raise ValueError(f"Analysis type unknown. Check param file.") 4280 4281 # Case no input analysis config file/dict 4282 # Use preset (exome/genome) to open default config file 4283 if not param_exomiser_analysis_dict: 4284 4285 # default preset 4286 default_preset = "exome" 4287 4288 # Get param preset or default preset 4289 param_exomiser_preset = param_exomiser.get("preset", default_preset) 4290 4291 # Try to find if preset is a file 4292 if os.path.exists(param_exomiser_preset): 4293 # Preset file is provided in full path 4294 param_exomiser_analysis_default_config_file = ( 4295 param_exomiser_preset 4296 ) 4297 # elif os.path.exists(full_path(param_exomiser_preset)): 4298 # # Preset file is provided in full path 4299 # param_exomiser_analysis_default_config_file = full_path(param_exomiser_preset) 4300 elif os.path.exists( 4301 os.path.join(folder_config, param_exomiser_preset) 4302 ): 4303 # Preset file is provided a basename in config folder (can be a path with subfolders) 4304 param_exomiser_analysis_default_config_file = os.path.join( 4305 folder_config, param_exomiser_preset 4306 ) 4307 else: 4308 # Construct preset file 4309 
param_exomiser_analysis_default_config_file = os.path.join( 4310 folder_config, 4311 f"preset-{param_exomiser_preset}-analysis.json", 4312 ) 4313 4314 # If preset file exists 4315 param_exomiser_analysis_default_config_file = full_path( 4316 param_exomiser_analysis_default_config_file 4317 ) 4318 if os.path.exists(param_exomiser_analysis_default_config_file): 4319 # Load prest file into analysis dict (either yaml or json) 4320 with open( 4321 param_exomiser_analysis_default_config_file 4322 ) as json_file: 4323 # param_exomiser_analysis_dict[""] = json.load(json_file) 4324 param_exomiser_analysis_dict["analysis"] = yaml.safe_load( 4325 json_file 4326 ) 4327 4328 # Error preset file 4329 else: 4330 log.error( 4331 f"No analysis preset config file ({param_exomiser_analysis_default_config_file})" 4332 ) 4333 raise ValueError( 4334 f"No analysis preset config file ({param_exomiser_analysis_default_config_file})" 4335 ) 4336 4337 # If no analysis dict created 4338 if not param_exomiser_analysis_dict: 4339 log.error(f"No analysis config") 4340 raise ValueError(f"No analysis config") 4341 4342 # Log 4343 log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}") 4344 4345 ### PHENOPACKET ### 4346 ################### 4347 4348 # If no PhenoPacket in analysis dict -> check in param 4349 if "phenopacket" not in param_exomiser_analysis_dict: 4350 4351 # If PhenoPacket in param -> load anlaysis json 4352 if param_exomiser.get("phenopacket", None): 4353 4354 param_exomiser_phenopacket = param_exomiser.get("phenopacket") 4355 param_exomiser_phenopacket = full_path( 4356 param_exomiser_phenopacket 4357 ) 4358 4359 # If param phenopacket is a file and exists 4360 if isinstance( 4361 param_exomiser_phenopacket, str 4362 ) and os.path.exists(param_exomiser_phenopacket): 4363 # Load phenopacket file into analysis dict (either yaml or json) 4364 with open(param_exomiser_phenopacket) as json_file: 4365 param_exomiser_analysis_dict["phenopacket"] = ( 4366 yaml.safe_load(json_file) 
4367 ) 4368 4369 # If param phenopacket is a dict 4370 elif isinstance(param_exomiser_phenopacket, dict): 4371 # Load phenopacket dict into analysis dict (either yaml or json) 4372 param_exomiser_analysis_dict["phenopacket"] = ( 4373 param_exomiser_phenopacket 4374 ) 4375 4376 # Error phenopacket type 4377 else: 4378 log.error(f"Phenopacket type unknown. Check param file.") 4379 raise ValueError( 4380 f"Phenopacket type unknown. Check param file." 4381 ) 4382 4383 # If no PhenoPacket in analysis dict -> construct from sample and HPO in param 4384 if "phenopacket" not in param_exomiser_analysis_dict: 4385 4386 # Init PhenoPacket 4387 param_exomiser_analysis_dict["phenopacket"] = { 4388 "id": "analysis", 4389 "proband": {}, 4390 } 4391 4392 ### Add subject ### 4393 4394 # If subject exists 4395 param_exomiser_subject = param_exomiser.get("subject", {}) 4396 4397 # If subject not exists -> found sample ID 4398 if not param_exomiser_subject: 4399 4400 # Found sample ID in param 4401 sample = param_exomiser.get("sample", None) 4402 4403 # Find sample ID (first sample) 4404 if not sample: 4405 sample_list = self.get_header_sample_list() 4406 if len(sample_list) > 0: 4407 sample = sample_list[0] 4408 else: 4409 log.error(f"No sample found") 4410 raise ValueError(f"No sample found") 4411 4412 # Create subject 4413 param_exomiser_subject = {"id": sample, "sex": "UNKNOWN_SEX"} 4414 4415 # Add to dict 4416 param_exomiser_analysis_dict["phenopacket"][ 4417 "subject" 4418 ] = param_exomiser_subject 4419 4420 ### Add "phenotypicFeatures" ### 4421 4422 # If phenotypicFeatures exists 4423 param_exomiser_phenotypicfeatures = param_exomiser.get( 4424 "phenotypicFeatures", [] 4425 ) 4426 4427 # If phenotypicFeatures not exists -> Try to infer from hpo list 4428 if not param_exomiser_phenotypicfeatures: 4429 4430 # Found HPO in param 4431 param_exomiser_hpo = param_exomiser.get("hpo", []) 4432 4433 # Split HPO if list in string format separated by comma 4434 if 
isinstance(param_exomiser_hpo, str): 4435 param_exomiser_hpo = param_exomiser_hpo.split(",") 4436 4437 # Create HPO list 4438 for hpo in param_exomiser_hpo: 4439 hpo_clean = re.sub("[^0-9]", "", hpo) 4440 param_exomiser_phenotypicfeatures.append( 4441 { 4442 "type": { 4443 "id": f"HP:{hpo_clean}", 4444 "label": f"HP:{hpo_clean}", 4445 } 4446 } 4447 ) 4448 4449 # Add to dict 4450 param_exomiser_analysis_dict["phenopacket"][ 4451 "phenotypicFeatures" 4452 ] = param_exomiser_phenotypicfeatures 4453 4454 # If phenotypicFeatures not exists -> Remove hiPhivePrioritiser step 4455 if not param_exomiser_phenotypicfeatures: 4456 for step in param_exomiser_analysis_dict.get( 4457 "analysis", {} 4458 ).get("steps", []): 4459 if "hiPhivePrioritiser" in step: 4460 param_exomiser_analysis_dict.get("analysis", {}).get( 4461 "steps", [] 4462 ).remove(step) 4463 4464 ### Add Input File ### 4465 4466 # Initial file name and htsFiles 4467 tmp_vcf_name = os.path.join(tmp_dir, "initial.vcf.gz") 4468 param_exomiser_analysis_dict["phenopacket"]["htsFiles"] = [ 4469 { 4470 "uri": tmp_vcf_name, 4471 "htsFormat": "VCF", 4472 "genomeAssembly": assembly, 4473 } 4474 ] 4475 4476 ### Add metaData ### 4477 4478 # If metaData not in analysis dict 4479 if "metaData" not in param_exomiser_analysis_dict: 4480 param_exomiser_analysis_dict["phenopacket"]["metaData"] = { 4481 "created": f"{datetime.datetime.now()}".replace(" ", "T") + "Z", 4482 "createdBy": "howard", 4483 "phenopacketSchemaVersion": 1, 4484 } 4485 4486 ### OutputOptions ### 4487 4488 # Init output result folder 4489 output_results = os.path.join(tmp_dir, "results") 4490 4491 # If no outputOptions in analysis dict 4492 if "outputOptions" not in param_exomiser_analysis_dict: 4493 4494 # default output formats 4495 defaut_output_formats = ["TSV_VARIANT", "VCF"] 4496 4497 # Get outputOptions in param 4498 output_options = param_exomiser.get("outputOptions", None) 4499 4500 # If no output_options in param -> check 4501 if not output_options: 
4502 output_options = { 4503 "outputContributingVariantsOnly": False, 4504 "numGenes": 0, 4505 "outputFormats": defaut_output_formats, 4506 } 4507 4508 # Replace outputDirectory in output options 4509 output_options["outputDirectory"] = output_results 4510 output_options["outputFileName"] = "howard" 4511 4512 # Add outputOptions in analysis dict 4513 param_exomiser_analysis_dict["outputOptions"] = output_options 4514 4515 else: 4516 4517 # Replace output_results and output format (if exists in param) 4518 param_exomiser_analysis_dict["outputOptions"][ 4519 "outputDirectory" 4520 ] = output_results 4521 param_exomiser_analysis_dict["outputOptions"]["outputFormats"] = ( 4522 list( 4523 set( 4524 param_exomiser_analysis_dict.get( 4525 "outputOptions", {} 4526 ).get("outputFormats", []) 4527 + ["TSV_VARIANT", "VCF"] 4528 ) 4529 ) 4530 ) 4531 4532 # log 4533 log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}") 4534 4535 ### ANALYSIS FILE ### 4536 ##################### 4537 4538 ### Full JSON analysis config file ### 4539 4540 exomiser_analysis = os.path.join(tmp_dir, "analysis.json") 4541 with open(exomiser_analysis, "w") as fp: 4542 json.dump(param_exomiser_analysis_dict, fp, indent=4) 4543 4544 ### SPLIT analysis and sample config files 4545 4546 # Splitted analysis dict 4547 param_exomiser_analysis_dict_for_split = ( 4548 param_exomiser_analysis_dict.copy() 4549 ) 4550 4551 # Phenopacket JSON file 4552 exomiser_analysis_phenopacket = os.path.join( 4553 tmp_dir, "analysis_phenopacket.json" 4554 ) 4555 with open(exomiser_analysis_phenopacket, "w") as fp: 4556 json.dump( 4557 param_exomiser_analysis_dict_for_split.get("phenopacket"), 4558 fp, 4559 indent=4, 4560 ) 4561 4562 # Analysis JSON file without Phenopacket parameters 4563 param_exomiser_analysis_dict_for_split.pop("phenopacket") 4564 exomiser_analysis_analysis = os.path.join( 4565 tmp_dir, "analysis_analysis.json" 4566 ) 4567 with open(exomiser_analysis_analysis, "w") as fp: 4568 
json.dump(param_exomiser_analysis_dict_for_split, fp, indent=4) 4569 4570 ### INITAL VCF file ### 4571 ####################### 4572 4573 ### Create list of samples to use and include inti initial VCF file #### 4574 4575 # Subject (main sample) 4576 # Get sample ID in analysis dict 4577 sample_subject = ( 4578 param_exomiser_analysis_dict.get("phenopacket", {}) 4579 .get("subject", {}) 4580 .get("id", None) 4581 ) 4582 sample_proband = ( 4583 param_exomiser_analysis_dict.get("phenopacket", {}) 4584 .get("proband", {}) 4585 .get("subject", {}) 4586 .get("id", None) 4587 ) 4588 sample = [] 4589 if sample_subject: 4590 sample.append(sample_subject) 4591 if sample_proband: 4592 sample.append(sample_proband) 4593 4594 # Get sample ID within Pedigree 4595 pedigree_persons_list = ( 4596 param_exomiser_analysis_dict.get("phenopacket", {}) 4597 .get("pedigree", {}) 4598 .get("persons", {}) 4599 ) 4600 4601 # Create list with all sample ID in pedigree (if exists) 4602 pedigree_persons = [] 4603 for person in pedigree_persons_list: 4604 pedigree_persons.append(person.get("individualId")) 4605 4606 # Concat subject sample ID and samples ID in pedigreesamples 4607 samples = list(set(sample + pedigree_persons)) 4608 4609 # Check if sample list is not empty 4610 if not samples: 4611 log.error(f"No samples found") 4612 raise ValueError(f"No samples found") 4613 4614 # Create VCF with sample (either sample in param or first one by default) 4615 # Export VCF file 4616 self.export_variant_vcf( 4617 vcf_file=tmp_vcf_name, 4618 remove_info=True, 4619 add_samples=True, 4620 list_samples=samples, 4621 index=False, 4622 ) 4623 4624 ### Execute Exomiser ### 4625 ######################## 4626 4627 # Init command 4628 exomiser_command = "" 4629 4630 # Command exomiser options 4631 exomiser_options = f" --spring.config.location={databases_folders}/{assembly}/application.properties --exomiser.data-directory={databases_folders}/{assembly} " 4632 4633 # Release 4634 exomiser_release = 
param_exomiser.get("release", None) 4635 if exomiser_release: 4636 # phenotype data version 4637 exomiser_options += ( 4638 f" --exomiser.phenotype.data-version={exomiser_release} " 4639 ) 4640 # data version 4641 exomiser_options += ( 4642 f" --exomiser.{assembly}.data-version={exomiser_release} " 4643 ) 4644 # variant white list 4645 variant_white_list_file = ( 4646 f"{exomiser_release}_{assembly}_clinvar_whitelist.tsv.gz" 4647 ) 4648 if os.path.exists( 4649 os.path.join( 4650 databases_folders, assembly, variant_white_list_file 4651 ) 4652 ): 4653 exomiser_options += f" --exomiser.{assembly}.variant-white-list-path={variant_white_list_file} " 4654 4655 # transcript_source 4656 transcript_source = param_exomiser.get( 4657 "transcript_source", None 4658 ) # ucsc, refseq, ensembl 4659 if transcript_source: 4660 exomiser_options += ( 4661 f" --exomiser.{assembly}.transcript-source={transcript_source} " 4662 ) 4663 4664 # If analysis contain proband param 4665 if param_exomiser_analysis_dict.get("phenopacket", {}).get( 4666 "proband", {} 4667 ): 4668 exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis_analysis} --sample={exomiser_analysis_phenopacket} {exomiser_options} " 4669 4670 # If no proband (usually uniq sample) 4671 else: 4672 exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis} {exomiser_options}" 4673 4674 # Log 4675 log.debug(f"exomiser_command_analysis={exomiser_command_analysis}") 4676 4677 # Run command 4678 result = subprocess.call( 4679 exomiser_command_analysis.split(), stdout=subprocess.PIPE 4680 ) 4681 if result: 4682 log.error("Exomiser command failed") 4683 raise ValueError("Exomiser command failed") 4684 4685 ### RESULTS ### 4686 ############### 4687 4688 ### Annotate with TSV fields ### 4689 4690 # Init result tsv file 4691 exomiser_to_info = param_exomiser.get("exomiser_to_info", False) 4692 4693 # Init result tsv file 4694 output_results_tsv = os.path.join(output_results, 
"howard.variants.tsv") 4695 4696 # Parse TSV file and explode columns in INFO field 4697 if exomiser_to_info and os.path.exists(output_results_tsv): 4698 4699 # Log 4700 log.debug("Exomiser columns to VCF INFO field") 4701 4702 # Retrieve columns and types 4703 query = f""" SELECT * FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) LIMIT 0 """ 4704 output_results_tsv_df = self.get_query_to_df(query) 4705 output_results_tsv_columns = output_results_tsv_df.columns.tolist() 4706 4707 # Init concat fields for update 4708 sql_query_update_concat_fields = [] 4709 4710 # Fields to avoid 4711 fields_to_avoid = [ 4712 "CONTIG", 4713 "START", 4714 "END", 4715 "REF", 4716 "ALT", 4717 "QUAL", 4718 "FILTER", 4719 "GENOTYPE", 4720 ] 4721 4722 # List all columns to add into header 4723 for header_column in output_results_tsv_columns: 4724 4725 # If header column is enable 4726 if header_column not in fields_to_avoid: 4727 4728 # Header info type 4729 header_info_type = "String" 4730 header_column_df = output_results_tsv_df[header_column] 4731 header_column_df_dtype = header_column_df.dtype 4732 if header_column_df_dtype == object: 4733 if ( 4734 pd.to_numeric(header_column_df, errors="coerce") 4735 .notnull() 4736 .all() 4737 ): 4738 header_info_type = "Float" 4739 else: 4740 header_info_type = "Integer" 4741 4742 # Header info 4743 characters_to_validate = ["-"] 4744 pattern = "[" + "".join(characters_to_validate) + "]" 4745 header_info_name = re.sub( 4746 pattern, 4747 "_", 4748 f"Exomiser_{header_column}".replace("#", ""), 4749 ) 4750 header_info_number = "." 
4751 header_info_description = ( 4752 f"Exomiser {header_column} annotation" 4753 ) 4754 header_info_source = "Exomiser" 4755 header_info_version = "unknown" 4756 header_info_code = CODE_TYPE_MAP[header_info_type] 4757 vcf_reader.infos[header_info_name] = vcf.parser._Info( 4758 header_info_name, 4759 header_info_number, 4760 header_info_type, 4761 header_info_description, 4762 header_info_source, 4763 header_info_version, 4764 header_info_code, 4765 ) 4766 4767 # Add field to add for update to concat fields 4768 sql_query_update_concat_fields.append( 4769 f""" 4770 CASE 4771 WHEN table_parquet."{header_column}" NOT IN ('','.') 4772 THEN concat( 4773 '{header_info_name}=', 4774 table_parquet."{header_column}", 4775 ';' 4776 ) 4777 4778 ELSE '' 4779 END 4780 """ 4781 ) 4782 4783 # Update query 4784 sql_query_update = f""" 4785 UPDATE {table_variants} as table_variants 4786 SET INFO = concat( 4787 CASE 4788 WHEN INFO NOT IN ('', '.') 4789 THEN INFO 4790 ELSE '' 4791 END, 4792 CASE 4793 WHEN table_variants.INFO NOT IN ('','.') 4794 THEN ';' 4795 ELSE '' 4796 END, 4797 ( 4798 SELECT 4799 concat( 4800 {",".join(sql_query_update_concat_fields)} 4801 ) 4802 FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) as table_parquet 4803 WHERE concat('chr', CAST(table_parquet.\"CONTIG\" AS STRING)) = table_variants.\"#CHROM\" 4804 AND table_parquet.\"START\" = table_variants.\"POS\" 4805 AND table_parquet.\"ALT\" = table_variants.\"ALT\" 4806 AND table_parquet.\"REF\" = table_variants.\"REF\" 4807 ) 4808 ) 4809 ; 4810 """ 4811 4812 # Update 4813 self.conn.execute(sql_query_update) 4814 4815 ### Annotate with VCF INFO field ### 4816 4817 # Init result VCF file 4818 output_results_vcf = os.path.join(output_results, "howard.vcf.gz") 4819 4820 # If VCF exists 4821 if os.path.exists(output_results_vcf): 4822 4823 # Log 4824 log.debug("Exomiser result VCF update variants") 4825 4826 # Find Exomiser INFO field annotation in header 4827 with 
gzip.open(output_results_vcf, "rt") as f: 4828 header_list = self.read_vcf_header(f) 4829 exomiser_vcf_header = vcf.Reader( 4830 io.StringIO("\n".join(header_list)) 4831 ) 4832 4833 # Add annotation INFO field to header 4834 vcf_reader.infos["Exomiser"] = exomiser_vcf_header.infos["Exomiser"] 4835 4836 # Update variants with VCF 4837 self.update_from_vcf(output_results_vcf) 4838 4839 return True 4840 4841 def annotation_snpeff(self, threads: int = None) -> None: 4842 """ 4843 This function annotate with snpEff 4844 4845 :param threads: The number of threads to use 4846 :return: the value of the variable "return_value". 4847 """ 4848 4849 # DEBUG 4850 log.debug("Start annotation with snpeff databases") 4851 4852 # Threads 4853 if not threads: 4854 threads = self.get_threads() 4855 log.debug("Threads: " + str(threads)) 4856 4857 # DEBUG 4858 delete_tmp = True 4859 if self.get_config().get("verbosity", "warning") in ["debug"]: 4860 delete_tmp = False 4861 log.debug("Delete tmp files/folders: " + str(delete_tmp)) 4862 4863 # Config 4864 config = self.get_config() 4865 log.debug("Config: " + str(config)) 4866 4867 # Config - Folders - Databases 4868 databases_folders = ( 4869 config.get("folders", {}).get("databases", {}).get("snpeff", ["."]) 4870 ) 4871 log.debug("Databases annotations: " + str(databases_folders)) 4872 4873 # # Config - Java 4874 # java_bin = get_bin( 4875 # tool="java", 4876 # bin="java", 4877 # bin_type="bin", 4878 # config=config, 4879 # default_folder="/usr/bin", 4880 # ) 4881 # if not (os.path.exists(java_bin) or (java_bin and which(java_bin))): 4882 # log.error(f"Annotation failed: no java bin '{java_bin}'") 4883 # raise ValueError(f"Annotation failed: no java bin '{java_bin}'") 4884 4885 # # Config - snpEff bin 4886 # snpeff_jar = get_bin( 4887 # tool="snpeff", 4888 # bin="snpEff.jar", 4889 # bin_type="jar", 4890 # config=config, 4891 # default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff", 4892 # ) 4893 # if not (os.path.exists(snpeff_jar) or 
(snpeff_jar and which(snpeff_jar))): 4894 # log.error(f"Annotation failed: no snpEff jar '{snpeff_jar}'") 4895 # raise ValueError(f"Annotation failed: no snpEff jar '{snpeff_jar}'") 4896 4897 # Config - snpEff bin command 4898 snpeff_bin_command = get_bin_command( 4899 bin="snpEff.jar", 4900 tool="snpeff", 4901 bin_type="jar", 4902 config=config, 4903 default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff", 4904 ) 4905 if not snpeff_bin_command: 4906 msg_err = f"Annotation failed: no snpeff bin '{snpeff_bin_command}'" 4907 log.error(msg_err) 4908 raise ValueError(msg_err) 4909 4910 # Config - snpEff databases 4911 snpeff_databases = ( 4912 config.get("folders", {}) 4913 .get("databases", {}) 4914 .get("snpeff", DEFAULT_SNPEFF_FOLDER) 4915 ) 4916 snpeff_databases = full_path(snpeff_databases) 4917 if snpeff_databases is not None and snpeff_databases != "": 4918 log.debug(f"Create snpEff databases folder") 4919 if not os.path.exists(snpeff_databases): 4920 os.makedirs(snpeff_databases) 4921 4922 # Param 4923 param = self.get_param() 4924 log.debug("Param: " + str(param)) 4925 4926 # Param 4927 options = param.get("annotation", {}).get("snpeff", {}).get("options", None) 4928 log.debug("Options: " + str(options)) 4929 4930 # Param - Assembly 4931 assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY)) 4932 4933 # Param - Options 4934 snpeff_options = ( 4935 param.get("annotation", {}).get("snpeff", {}).get("options", "") 4936 ) 4937 snpeff_stats = param.get("annotation", {}).get("snpeff", {}).get("stats", None) 4938 snpeff_csvstats = ( 4939 param.get("annotation", {}).get("snpeff", {}).get("csvStats", None) 4940 ) 4941 if snpeff_stats: 4942 snpeff_stats = snpeff_stats.replace("OUTPUT", self.get_output()) 4943 snpeff_stats = full_path(snpeff_stats) 4944 snpeff_options += f" -stats {snpeff_stats}" 4945 if snpeff_csvstats: 4946 snpeff_csvstats = snpeff_csvstats.replace("OUTPUT", self.get_output()) 4947 snpeff_csvstats = full_path(snpeff_csvstats) 4948 
snpeff_options += f" -csvStats {snpeff_csvstats}" 4949 4950 # Data 4951 table_variants = self.get_table_variants() 4952 4953 # Check if not empty 4954 log.debug("Check if not empty") 4955 sql_query_chromosomes = ( 4956 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 4957 ) 4958 # if not self.conn.execute(f"{sql_query_chromosomes}").df()["count"][0]: 4959 if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]: 4960 log.info(f"VCF empty") 4961 return 4962 4963 # Export in VCF 4964 log.debug("Create initial file to annotate") 4965 tmp_vcf = NamedTemporaryFile( 4966 prefix=self.get_prefix(), 4967 dir=self.get_tmp_dir(), 4968 suffix=".vcf.gz", 4969 delete=True, 4970 ) 4971 tmp_vcf_name = tmp_vcf.name 4972 4973 # VCF header 4974 vcf_reader = self.get_header() 4975 log.debug("Initial header: " + str(vcf_reader.infos)) 4976 4977 # Existing annotations 4978 for vcf_annotation in self.get_header().infos: 4979 4980 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 4981 log.debug( 4982 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 4983 ) 4984 4985 # Memory limit 4986 # if config.get("memory", None): 4987 # memory_limit = config.get("memory", "8G") 4988 # else: 4989 # memory_limit = "8G" 4990 memory_limit = self.get_memory("8G") 4991 log.debug(f"memory_limit: {memory_limit}") 4992 4993 # snpEff java options 4994 snpeff_java_options = ( 4995 f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} " 4996 ) 4997 log.debug(f"Exomiser java options: {snpeff_java_options}") 4998 4999 force_update_annotation = True 5000 5001 if "ANN" not in self.get_header().infos or force_update_annotation: 5002 5003 # Check snpEff database 5004 log.debug(f"Check snpEff databases {[assembly]}") 5005 databases_download_snpeff( 5006 folder=snpeff_databases, assemblies=[assembly], config=config 5007 ) 5008 5009 # Export VCF file 5010 self.export_variant_vcf( 5011 vcf_file=tmp_vcf_name, 5012 remove_info=True, 
5013 add_samples=False, 5014 index=True, 5015 ) 5016 5017 # Tmp file 5018 err_files = [] 5019 tmp_annotate_vcf = NamedTemporaryFile( 5020 prefix=self.get_prefix(), 5021 dir=self.get_tmp_dir(), 5022 suffix=".vcf", 5023 delete=False, 5024 ) 5025 tmp_annotate_vcf_name = tmp_annotate_vcf.name 5026 tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err" 5027 err_files.append(tmp_annotate_vcf_name_err) 5028 5029 # Command 5030 snpeff_command = f"{snpeff_bin_command} {assembly} -dataDir {snpeff_databases} {snpeff_options} {tmp_vcf_name} 1>{tmp_annotate_vcf_name} 2>>{tmp_annotate_vcf_name_err}" 5031 log.debug(f"Annotation - snpEff command: {snpeff_command}") 5032 run_parallel_commands([snpeff_command], 1) 5033 5034 # Error messages 5035 log.info(f"Error/Warning messages:") 5036 error_message_command_all = [] 5037 error_message_command_warning = [] 5038 error_message_command_err = [] 5039 for err_file in err_files: 5040 with open(err_file, "r") as f: 5041 for line in f: 5042 message = line.strip() 5043 error_message_command_all.append(message) 5044 if line.startswith("[W::"): 5045 error_message_command_warning.append(message) 5046 if line.startswith("[E::"): 5047 error_message_command_err.append(f"{err_file}: " + message) 5048 # log info 5049 for message in list( 5050 set(error_message_command_err + error_message_command_warning) 5051 ): 5052 log.info(f" {message}") 5053 # debug info 5054 for message in list(set(error_message_command_all)): 5055 log.debug(f" {message}") 5056 # failed 5057 if len(error_message_command_err): 5058 log.error("Annotation failed: Error in commands") 5059 raise ValueError("Annotation failed: Error in commands") 5060 5061 # Find annotation in header 5062 with open(tmp_annotate_vcf_name, "rt") as f: 5063 header_list = self.read_vcf_header(f) 5064 annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list))) 5065 5066 for ann in annovar_vcf_header.infos: 5067 if ann not in self.get_header().infos: 5068 vcf_reader.infos[ann] = 
annovar_vcf_header.infos.get(ann) 5069 5070 # Update variants 5071 log.info(f"Annotation - Updating...") 5072 self.update_from_vcf(tmp_annotate_vcf_name) 5073 5074 else: 5075 if "ANN" in self.get_header().infos: 5076 log.debug(f"Existing snpEff annotations in VCF") 5077 if force_update_annotation: 5078 log.debug(f"Existing snpEff annotations in VCF - annotation forced") 5079 5080 def annotation_annovar(self, threads: int = None) -> None: 5081 """ 5082 It takes a VCF file, annotates it with Annovar, and then updates the database with the new 5083 annotations 5084 5085 :param threads: number of threads to use 5086 :return: the value of the variable "return_value". 5087 """ 5088 5089 # DEBUG 5090 log.debug("Start annotation with Annovar databases") 5091 5092 # Threads 5093 if not threads: 5094 threads = self.get_threads() 5095 log.debug("Threads: " + str(threads)) 5096 5097 # Tmp en Err files 5098 tmp_files = [] 5099 err_files = [] 5100 5101 # DEBUG 5102 delete_tmp = True 5103 if self.get_config().get("verbosity", "warning") in ["debug"]: 5104 delete_tmp = False 5105 log.debug("Delete tmp files/folders: " + str(delete_tmp)) 5106 5107 # Config 5108 config = self.get_config() 5109 log.debug("Config: " + str(config)) 5110 5111 # Config - Folders - Databases 5112 databases_folders = ( 5113 config.get("folders", {}).get("databases", {}).get("annovar", ["."]) 5114 ) 5115 log.debug("Databases annotations: " + str(databases_folders)) 5116 5117 # Config - annovar bin command 5118 annovar_bin_command = get_bin_command( 5119 bin="table_annovar.pl", 5120 tool="annovar", 5121 bin_type="perl", 5122 config=config, 5123 default_folder=f"{DEFAULT_TOOLS_FOLDER}/annovar", 5124 ) 5125 if not annovar_bin_command: 5126 msg_err = f"Annotation failed: no annovar bin '{annovar_bin_command}'" 5127 log.error(msg_err) 5128 raise ValueError(msg_err) 5129 5130 # Config - BCFTools bin command 5131 bcftools_bin_command = get_bin_command( 5132 bin="bcftools", 5133 tool="bcftools", 5134 
bin_type="bin", 5135 config=config, 5136 default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools", 5137 ) 5138 if not bcftools_bin_command: 5139 msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'" 5140 log.error(msg_err) 5141 raise ValueError(msg_err) 5142 5143 # Config - annovar databases 5144 annovar_databases = ( 5145 config.get("folders", {}) 5146 .get("databases", {}) 5147 .get("annovar", DEFAULT_ANNOVAR_FOLDER) 5148 ) 5149 annovar_databases = full_path(annovar_databases) 5150 if annovar_databases != "" and not os.path.exists(annovar_databases): 5151 os.makedirs(annovar_databases) 5152 5153 # Param 5154 param = self.get_param() 5155 log.debug("Param: " + str(param)) 5156 5157 # Param - options 5158 options = param.get("annotation", {}).get("annovar", {}).get("options", {}) 5159 log.debug("Options: " + str(options)) 5160 5161 # Param - annotations 5162 annotations = ( 5163 param.get("annotation", {}).get("annovar", {}).get("annotations", {}) 5164 ) 5165 log.debug("Annotations: " + str(annotations)) 5166 5167 # Param - Assembly 5168 assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY)) 5169 5170 # Annovar database assembly 5171 annovar_databases_assembly = f"{annovar_databases}/{assembly}" 5172 if annovar_databases_assembly != "" and not os.path.exists( 5173 annovar_databases_assembly 5174 ): 5175 os.makedirs(annovar_databases_assembly) 5176 5177 # Data 5178 table_variants = self.get_table_variants() 5179 5180 # Check if not empty 5181 log.debug("Check if not empty") 5182 sql_query_chromosomes = ( 5183 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 5184 ) 5185 sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes) 5186 if not sql_query_chromosomes_df["count"][0]: 5187 log.info(f"VCF empty") 5188 return 5189 5190 # VCF header 5191 vcf_reader = self.get_header() 5192 log.debug("Initial header: " + str(vcf_reader.infos)) 5193 5194 # Existing annotations 5195 for vcf_annotation in 
self.get_header().infos: 5196 5197 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 5198 log.debug( 5199 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 5200 ) 5201 5202 force_update_annotation = True 5203 5204 if annotations: 5205 5206 commands = [] 5207 tmp_annotates_vcf_name_list = [] 5208 5209 # Export in VCF 5210 log.debug("Create initial file to annotate") 5211 tmp_vcf = NamedTemporaryFile( 5212 prefix=self.get_prefix(), 5213 dir=self.get_tmp_dir(), 5214 suffix=".vcf.gz", 5215 delete=False, 5216 ) 5217 tmp_vcf_name = tmp_vcf.name 5218 tmp_files.append(tmp_vcf_name) 5219 tmp_files.append(tmp_vcf_name + ".tbi") 5220 5221 # Export VCF file 5222 self.export_variant_vcf( 5223 vcf_file=tmp_vcf_name, 5224 remove_info=".", 5225 add_samples=False, 5226 index=True, 5227 ) 5228 5229 # Create file for field rename 5230 log.debug("Create file for field rename") 5231 tmp_rename = NamedTemporaryFile( 5232 prefix=self.get_prefix(), 5233 dir=self.get_tmp_dir(), 5234 suffix=".rename", 5235 delete=False, 5236 ) 5237 tmp_rename_name = tmp_rename.name 5238 tmp_files.append(tmp_rename_name) 5239 5240 # Check Annovar database 5241 log.debug( 5242 f"Check Annovar databases {[assembly]}: {list(annotations.keys())}" 5243 ) 5244 databases_download_annovar( 5245 folder=annovar_databases, 5246 files=list(annotations.keys()), 5247 assemblies=[assembly], 5248 ) 5249 5250 for annotation in annotations: 5251 annotation_fields = annotations[annotation] 5252 5253 if not annotation_fields: 5254 annotation_fields = {"INFO": None} 5255 5256 log.info(f"Annotations Annovar - database '{annotation}'") 5257 log.debug(f"Annotation '{annotation}' - fields: {annotation_fields}") 5258 5259 # Tmp file for annovar 5260 err_files = [] 5261 tmp_annotate_vcf_directory = TemporaryDirectory( 5262 prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".annovar" 5263 ) 5264 tmp_annotate_vcf_prefix = tmp_annotate_vcf_directory.name + "/annovar" 5265 
tmp_annotate_vcf_name_annovar = ( 5266 tmp_annotate_vcf_prefix + "." + assembly + "_multianno.vcf" 5267 ) 5268 tmp_annotate_vcf_name_err = tmp_annotate_vcf_directory.name + "/.err" 5269 err_files.append(tmp_annotate_vcf_name_err) 5270 tmp_files.append(tmp_annotate_vcf_name_err) 5271 5272 # Tmp file final vcf annotated by annovar 5273 tmp_annotate_vcf = NamedTemporaryFile( 5274 prefix=self.get_prefix(), 5275 dir=self.get_tmp_dir(), 5276 suffix=".vcf.gz", 5277 delete=False, 5278 ) 5279 tmp_annotate_vcf_name = tmp_annotate_vcf.name 5280 tmp_annotates_vcf_name_list.append(tmp_annotate_vcf_name) 5281 tmp_files.append(tmp_annotate_vcf_name) 5282 tmp_files.append(tmp_annotate_vcf_name + ".tbi") 5283 5284 # Number of fields 5285 annotation_list = [] 5286 annotation_renamed_list = [] 5287 5288 for annotation_field in annotation_fields: 5289 5290 # field new name, if parametered SKIPPED !!!!!! not managed actually TODO 5291 annotation_fields_new_name = annotation_fields.get( 5292 annotation_field, annotation_field 5293 ) 5294 if not annotation_fields_new_name: 5295 annotation_fields_new_name = annotation_field 5296 5297 if ( 5298 force_update_annotation 5299 or annotation_fields_new_name not in self.get_header().infos 5300 ): 5301 annotation_list.append(annotation_field) 5302 annotation_renamed_list.append(annotation_fields_new_name) 5303 else: # annotation_fields_new_name in self.get_header().infos and not force_update_annotation: 5304 log.warning( 5305 f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)" 5306 ) 5307 5308 # Add rename info 5309 run_parallel_commands( 5310 [ 5311 f"echo 'INFO/{annotation_field} {annotation_fields_new_name}' >> {tmp_rename_name}" 5312 ], 5313 1, 5314 ) 5315 5316 # log.debug("fields_to_removed: " + str(fields_to_removed)) 5317 log.debug("annotation_list: " + str(annotation_list)) 5318 5319 # protocol 5320 protocol = annotation 5321 5322 # argument 5323 argument = "" 5324 5325 # operation 5326 operation = "f" 
5327 if annotation in ["refGene", "refGeneWithVer"] or annotation.startswith( 5328 "ensGene" 5329 ): 5330 operation = "g" 5331 if options.get("genebase", None): 5332 argument = f"""'{options.get("genebase","")}'""" 5333 elif annotation in ["cytoBand"]: 5334 operation = "r" 5335 5336 # argument option 5337 argument_option = "" 5338 if argument != "": 5339 argument_option = " --argument " + argument 5340 5341 # command options 5342 command_options = f""" --nastring . --vcfinput --polish --dot2underline --thread {threads} """ # --intronhgvs 10 5343 for option in options: 5344 if option not in ["genebase"]: 5345 command_options += f""" --{option}={options[option]}""" 5346 5347 # Command 5348 5349 # Command - Annovar 5350 command_annovar = f"""{annovar_bin_command} {tmp_vcf_name} {annovar_databases_assembly} --buildver {assembly} --outfile {tmp_annotate_vcf_prefix} --remove --protocol {protocol} --operation {operation} {argument_option} {command_options} 2>>{tmp_annotate_vcf_name_err} && mv {tmp_annotate_vcf_name_annovar} {tmp_annotate_vcf_name}.tmp.vcf """ 5351 tmp_files.append(f"{tmp_annotate_vcf_name}.tmp.vcf") 5352 5353 # Command - start pipe 5354 command_annovar += f""" && {bcftools_bin_command} view --threads={threads} {tmp_annotate_vcf_name}.tmp.vcf 2>>{tmp_annotate_vcf_name_err} """ 5355 5356 # Command - Clean INFO/ANNOVAR_DATE (due to Annovar issue with multiple TAGS!) 
5357 command_annovar += """ | sed "s/ANNOVAR_DATE=[^;\t]*;//gi" """ 5358 5359 # Command - Special characters (refGene annotation) 5360 command_annovar += """ | sed "s/\\\\\\x3b/,/gi" """ 5361 5362 # Command - Clean empty fields (with value ".") 5363 command_annovar += """ | awk -F'\\t' -v OFS='\\t' '{if ($0 ~ /^#/) print; else {split($8,a,";");for(i=1;i<=length(a);i++) {split(a[i],b,"=");if(b[2]!=".") {c[b[1]]=b[2]}}; split($8,d,";");for(i=1;i<=length(d);i++) {split(d[i],e,"=");if(c[e[1]]!="") {if(info!="") {info=info";"}; info=info""e[1]"="c[e[1]]}}; if(info!="") {$8=info} else {$8=""}; delete c; info=""; print}}' """ 5364 5365 # Command - Extract only needed fields, and remove ANNOVAR fields, and compress and index final file 5366 annovar_fields_to_keep = ["INFO/ANNOVAR_DATE", "INFO/ALLELE_END"] 5367 if "ALL" not in annotation_list and "INFO" not in annotation_list: 5368 # for ann in annotation_renamed_list: 5369 for ann in annotation_list: 5370 annovar_fields_to_keep.append(f"^INFO/{ann}") 5371 5372 command_annovar += f""" | {bcftools_bin_command} annotate --pair-logic exact --threads={threads} -x {",".join(annovar_fields_to_keep)} --rename-annots={tmp_rename_name} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} """ 5373 5374 # Command - indexing 5375 command_annovar += f""" && tabix {tmp_annotate_vcf_name} """ 5376 5377 log.debug(f"Annotation - Annovar command: {command_annovar}") 5378 run_parallel_commands([command_annovar], 1) 5379 5380 # Error messages 5381 log.info(f"Error/Warning messages:") 5382 error_message_command_all = [] 5383 error_message_command_warning = [] 5384 error_message_command_err = [] 5385 for err_file in err_files: 5386 with open(err_file, "r") as f: 5387 for line in f: 5388 message = line.strip() 5389 error_message_command_all.append(message) 5390 if line.startswith("[W::") or line.startswith("WARNING"): 5391 error_message_command_warning.append(message) 5392 if line.startswith("[E::") or line.startswith("ERROR"): 5393 
error_message_command_err.append( 5394 f"{err_file}: " + message 5395 ) 5396 # log info 5397 for message in list( 5398 set(error_message_command_err + error_message_command_warning) 5399 ): 5400 log.info(f" {message}") 5401 # debug info 5402 for message in list(set(error_message_command_all)): 5403 log.debug(f" {message}") 5404 # failed 5405 if len(error_message_command_err): 5406 log.error("Annotation failed: Error in commands") 5407 raise ValueError("Annotation failed: Error in commands") 5408 5409 if tmp_annotates_vcf_name_list: 5410 5411 # List of annotated files 5412 tmp_annotates_vcf_name_to_merge = " ".join(tmp_annotates_vcf_name_list) 5413 5414 # Tmp file 5415 tmp_annotate_vcf = NamedTemporaryFile( 5416 prefix=self.get_prefix(), 5417 dir=self.get_tmp_dir(), 5418 suffix=".vcf.gz", 5419 delete=False, 5420 ) 5421 tmp_annotate_vcf_name = tmp_annotate_vcf.name 5422 tmp_files.append(tmp_annotate_vcf_name) 5423 tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err" 5424 err_files.append(tmp_annotate_vcf_name_err) 5425 tmp_files.append(tmp_annotate_vcf_name_err) 5426 5427 # Command merge 5428 merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_annotates_vcf_name_to_merge} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} " 5429 log.info( 5430 f"Annotation Annovar - Annotation merging " 5431 + str(len(tmp_annotates_vcf_name_list)) 5432 + " annotated files" 5433 ) 5434 log.debug(f"Annotation - merge command: {merge_command}") 5435 run_parallel_commands([merge_command], 1) 5436 5437 # Find annotation in header 5438 with bgzf.open(tmp_annotate_vcf_name, "rt") as f: 5439 header_list = self.read_vcf_header(f) 5440 annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list))) 5441 5442 for ann in annovar_vcf_header.infos: 5443 if ann not in self.get_header().infos: 5444 vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann) 5445 5446 # Update variants 5447 log.info(f"Annotation Annovar - 
Updating...") 5448 self.update_from_vcf(tmp_annotate_vcf_name) 5449 5450 # Clean files 5451 # Tmp file remove command 5452 if True: 5453 tmp_files_remove_command = "" 5454 if tmp_files: 5455 tmp_files_remove_command = " ".join(tmp_files) 5456 clean_command = f" rm -f {tmp_files_remove_command} " 5457 log.debug(f"Annotation Annovar - Annotation cleaning ") 5458 log.debug(f"Annotation - cleaning command: {clean_command}") 5459 run_parallel_commands([clean_command], 1) 5460 5461 # Parquet 5462 def annotation_parquet(self, threads: int = None) -> None: 5463 """ 5464 It takes a VCF file, and annotates it with a parquet file 5465 5466 :param threads: number of threads to use for the annotation 5467 :return: the value of the variable "result". 5468 """ 5469 5470 # DEBUG 5471 log.debug("Start annotation with parquet databases") 5472 5473 # Threads 5474 if not threads: 5475 threads = self.get_threads() 5476 log.debug("Threads: " + str(threads)) 5477 5478 # DEBUG 5479 delete_tmp = True 5480 if self.get_config().get("verbosity", "warning") in ["debug"]: 5481 delete_tmp = False 5482 log.debug("Delete tmp files/folders: " + str(delete_tmp)) 5483 5484 # Config 5485 databases_folders = set( 5486 self.get_config() 5487 .get("folders", {}) 5488 .get("databases", {}) 5489 .get("annotations", ["."]) 5490 + self.get_config() 5491 .get("folders", {}) 5492 .get("databases", {}) 5493 .get("parquet", ["."]) 5494 ) 5495 log.debug("Databases annotations: " + str(databases_folders)) 5496 5497 # Param 5498 annotations = ( 5499 self.get_param() 5500 .get("annotation", {}) 5501 .get("parquet", {}) 5502 .get("annotations", None) 5503 ) 5504 log.debug("Annotations: " + str(annotations)) 5505 5506 # Assembly 5507 assembly = self.get_param().get( 5508 "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY) 5509 ) 5510 5511 # Force Update Annotation 5512 force_update_annotation = ( 5513 self.get_param() 5514 .get("annotation", {}) 5515 .get("options", {}) 5516 .get("annotations_update", 
False) 5517 ) 5518 log.debug(f"force_update_annotation={force_update_annotation}") 5519 force_append_annotation = ( 5520 self.get_param() 5521 .get("annotation", {}) 5522 .get("options", {}) 5523 .get("annotations_append", False) 5524 ) 5525 log.debug(f"force_append_annotation={force_append_annotation}") 5526 5527 # Data 5528 table_variants = self.get_table_variants() 5529 5530 # Check if not empty 5531 log.debug("Check if not empty") 5532 sql_query_chromosomes_df = self.get_query_to_df( 5533 f"""SELECT count(*) as count FROM {table_variants} as table_variants LIMIT 1""" 5534 ) 5535 if not sql_query_chromosomes_df["count"][0]: 5536 log.info(f"VCF empty") 5537 return 5538 5539 # VCF header 5540 vcf_reader = self.get_header() 5541 log.debug("Initial header: " + str(vcf_reader.infos)) 5542 5543 # Nb Variants POS 5544 log.debug("NB Variants Start") 5545 nb_variants = self.conn.execute( 5546 f"SELECT count(*) AS count FROM variants" 5547 ).fetchdf()["count"][0] 5548 log.debug("NB Variants Stop") 5549 5550 # Existing annotations 5551 for vcf_annotation in self.get_header().infos: 5552 5553 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 5554 log.debug( 5555 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 5556 ) 5557 5558 # Added columns 5559 added_columns = [] 5560 5561 # drop indexes 5562 log.debug(f"Drop indexes...") 5563 self.drop_indexes() 5564 5565 if annotations: 5566 5567 if "ALL" in annotations: 5568 5569 all_param = annotations.get("ALL", {}) 5570 all_param_formats = all_param.get("formats", None) 5571 all_param_releases = all_param.get("releases", None) 5572 5573 databases_infos_dict = self.scan_databases( 5574 database_formats=all_param_formats, 5575 database_releases=all_param_releases, 5576 ) 5577 for database_infos in databases_infos_dict.keys(): 5578 if database_infos not in annotations: 5579 annotations[database_infos] = {"INFO": None} 5580 5581 for annotation in annotations: 5582 5583 if annotation in ["ALL"]: 
5584 continue 5585 5586 # Annotation Name 5587 annotation_name = os.path.basename(annotation) 5588 5589 # Annotation fields 5590 annotation_fields = annotations[annotation] 5591 if not annotation_fields: 5592 annotation_fields = {"INFO": None} 5593 5594 log.debug(f"Annotation '{annotation_name}'") 5595 log.debug( 5596 f"Annotation '{annotation_name}' - fields: {annotation_fields}" 5597 ) 5598 5599 # Create Database 5600 database = Database( 5601 database=annotation, 5602 databases_folders=databases_folders, 5603 assembly=assembly, 5604 ) 5605 5606 # Find files 5607 parquet_file = database.get_database() 5608 parquet_hdr_file = database.get_header_file() 5609 parquet_type = database.get_type() 5610 5611 # Check if files exists 5612 if not parquet_file or not parquet_hdr_file: 5613 log.error("Annotation failed: file not found") 5614 raise ValueError("Annotation failed: file not found") 5615 else: 5616 # Get parquet connexion 5617 parquet_sql_attach = database.get_sql_database_attach( 5618 output="query" 5619 ) 5620 if parquet_sql_attach: 5621 self.conn.execute(parquet_sql_attach) 5622 parquet_file_link = database.get_sql_database_link() 5623 # Log 5624 log.debug( 5625 f"Annotation '{annotation_name}' - file: " 5626 + str(parquet_file) 5627 + " and " 5628 + str(parquet_hdr_file) 5629 ) 5630 5631 # Database full header columns 5632 parquet_hdr_vcf_header_columns = database.get_header_file_columns( 5633 parquet_hdr_file 5634 ) 5635 # Log 5636 log.debug( 5637 "Annotation database header columns : " 5638 + str(parquet_hdr_vcf_header_columns) 5639 ) 5640 5641 # Load header as VCF object 5642 parquet_hdr_vcf_header_infos = database.get_header().infos 5643 # Log 5644 log.debug( 5645 "Annotation database header: " 5646 + str(parquet_hdr_vcf_header_infos) 5647 ) 5648 5649 # Get extra infos 5650 parquet_columns = database.get_extra_columns() 5651 # Log 5652 log.debug("Annotation database Columns: " + str(parquet_columns)) 5653 5654 # Add extra columns if "ALL" in 
annotation_fields 5655 # if "ALL" in annotation_fields: 5656 # allow_add_extra_column = True 5657 if "ALL" in annotation_fields and database.get_extra_columns(): 5658 for extra_column in database.get_extra_columns(): 5659 if ( 5660 extra_column not in annotation_fields 5661 and extra_column.replace("INFO/", "") 5662 not in parquet_hdr_vcf_header_infos 5663 ): 5664 parquet_hdr_vcf_header_infos[extra_column] = ( 5665 vcf.parser._Info( 5666 extra_column, 5667 ".", 5668 "String", 5669 f"{extra_column} description", 5670 "unknown", 5671 "unknown", 5672 self.code_type_map["String"], 5673 ) 5674 ) 5675 5676 # For all fields in database 5677 annotation_fields_all = False 5678 if "ALL" in annotation_fields or "INFO" in annotation_fields: 5679 annotation_fields_all = True 5680 annotation_fields = { 5681 key: key for key in parquet_hdr_vcf_header_infos 5682 } 5683 5684 log.debug( 5685 "Annotation database header - All annotations added: " 5686 + str(annotation_fields) 5687 ) 5688 5689 # Init 5690 5691 # List of annotation fields to use 5692 sql_query_annotation_update_info_sets = [] 5693 5694 # List of annotation to agregate 5695 sql_query_annotation_to_agregate = [] 5696 5697 # Number of fields 5698 nb_annotation_field = 0 5699 5700 # Annotation fields processed 5701 annotation_fields_processed = [] 5702 5703 # Columns mapping 5704 map_columns = database.map_columns( 5705 columns=annotation_fields, prefixes=["INFO/"] 5706 ) 5707 5708 # Query dict for fields to remove (update option) 5709 query_dict_remove = {} 5710 5711 # Fetch Anotation fields 5712 for annotation_field in annotation_fields: 5713 5714 # annotation_field_column 5715 annotation_field_column = map_columns.get( 5716 annotation_field, "INFO" 5717 ) 5718 5719 # field new name, if parametered 5720 annotation_fields_new_name = annotation_fields.get( 5721 annotation_field, annotation_field 5722 ) 5723 if not annotation_fields_new_name: 5724 annotation_fields_new_name = annotation_field 5725 5726 # To annotate 5727 # 
force_update_annotation = True 5728 # force_append_annotation = True 5729 # if annotation_field in parquet_hdr_vcf_header_infos and (force_update_annotation or (annotation_fields_new_name not in self.get_header().infos)): 5730 if annotation_field in parquet_hdr_vcf_header_infos and ( 5731 force_update_annotation 5732 or force_append_annotation 5733 or ( 5734 annotation_fields_new_name 5735 not in self.get_header().infos 5736 ) 5737 ): 5738 5739 # Add field to annotation to process list 5740 annotation_fields_processed.append( 5741 annotation_fields_new_name 5742 ) 5743 5744 # explode infos for the field 5745 annotation_fields_new_name_info_msg = "" 5746 if ( 5747 force_update_annotation 5748 and annotation_fields_new_name 5749 in self.get_header().infos 5750 ): 5751 # Remove field from INFO 5752 query = f""" 5753 UPDATE {table_variants} as table_variants 5754 SET INFO = REGEXP_REPLACE( 5755 concat(table_variants.INFO,''), 5756 ';*{annotation_fields_new_name}=[^;]*', 5757 '' 5758 ) 5759 WHERE concat(';',table_variants.INFO) LIKE '%;{annotation_fields_new_name}=%' 5760 """ 5761 annotation_fields_new_name_info_msg = " [update]" 5762 query_dict_remove[ 5763 f"remove 'INFO/{annotation_fields_new_name}'" 5764 ] = query 5765 5766 # Sep between fields in INFO 5767 nb_annotation_field += 1 5768 if nb_annotation_field > 1: 5769 annotation_field_sep = ";" 5770 else: 5771 annotation_field_sep = "" 5772 5773 log.info( 5774 f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'{annotation_fields_new_name_info_msg}" 5775 ) 5776 5777 # Add INFO field to header 5778 parquet_hdr_vcf_header_infos_number = ( 5779 parquet_hdr_vcf_header_infos[annotation_field].num 5780 or "." 
5781 ) 5782 parquet_hdr_vcf_header_infos_type = ( 5783 parquet_hdr_vcf_header_infos[annotation_field].type 5784 or "String" 5785 ) 5786 parquet_hdr_vcf_header_infos_description = ( 5787 parquet_hdr_vcf_header_infos[annotation_field].desc 5788 or f"{annotation_field} description" 5789 ) 5790 parquet_hdr_vcf_header_infos_source = ( 5791 parquet_hdr_vcf_header_infos[annotation_field].source 5792 or "unknown" 5793 ) 5794 parquet_hdr_vcf_header_infos_version = ( 5795 parquet_hdr_vcf_header_infos[annotation_field].version 5796 or "unknown" 5797 ) 5798 5799 vcf_reader.infos[annotation_fields_new_name] = ( 5800 vcf.parser._Info( 5801 annotation_fields_new_name, 5802 parquet_hdr_vcf_header_infos_number, 5803 parquet_hdr_vcf_header_infos_type, 5804 parquet_hdr_vcf_header_infos_description, 5805 parquet_hdr_vcf_header_infos_source, 5806 parquet_hdr_vcf_header_infos_version, 5807 self.code_type_map[ 5808 parquet_hdr_vcf_header_infos_type 5809 ], 5810 ) 5811 ) 5812 5813 # Append 5814 if force_append_annotation: 5815 query_case_when_append = f""" AND REGEXP_EXTRACT(concat(';', table_variants.INFO), ';{annotation_fields_new_name}=([^;]*)',1) IN ('','.') """ 5816 else: 5817 query_case_when_append = "" 5818 5819 # Annotation/Update query fields 5820 # Found in INFO column 5821 if ( 5822 annotation_field_column == "INFO" 5823 and "INFO" in parquet_hdr_vcf_header_columns 5824 ): 5825 sql_query_annotation_update_info_sets.append( 5826 f""" 5827 CASE WHEN REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1) NOT IN ('','.') {query_case_when_append} 5828 THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1)) 5829 ELSE '' 5830 END 5831 """ 5832 ) 5833 # Found in a specific column 5834 else: 5835 # sql_query_annotation_update_info_sets.append( 5836 # f""" 5837 # CASE WHEN table_parquet."{annotation_field_column}" NOT IN ('','.') {query_case_when_append} 5838 # THEN 
concat('{annotation_field_sep}', '{annotation_fields_new_name}=', replace(table_parquet."{annotation_field_column}", ';', ',')) 5839 # ELSE '' 5840 # END 5841 # """ 5842 # ) 5843 sql_query_annotation_update_info_sets.append( 5844 f""" 5845 CASE WHEN table_parquet."{annotation_field_column}" NOT IN ('','.') {query_case_when_append} 5846 THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', replace(CAST(table_parquet."{annotation_field_column}" AS VARCHAR), ';', ',')) 5847 ELSE '' 5848 END 5849 """ 5850 ) 5851 sql_query_annotation_to_agregate.append( 5852 f""" string_agg(DISTINCT table_parquet_from."{annotation_field_column}", ',') AS "{annotation_field_column}" """ 5853 ) 5854 5855 # Not to annotate 5856 else: 5857 5858 if force_update_annotation: 5859 annotation_message = "forced" 5860 else: 5861 annotation_message = "skipped" 5862 5863 if annotation_field not in parquet_hdr_vcf_header_infos: 5864 log.warning( 5865 f"Annotation '{annotation_name}' - '{annotation_field}' [{nb_annotation_field}] - not available in parquet file" 5866 ) 5867 if annotation_fields_new_name in self.get_header().infos: 5868 log.warning( 5869 f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' [{nb_annotation_field}] - already exists in header ({annotation_message})" 5870 ) 5871 5872 # Check if ALL fields have to be annotated. 
Thus concat all INFO field 5873 # allow_annotation_full_info = True 5874 allow_annotation_full_info = not force_append_annotation 5875 5876 if parquet_type in ["regions"]: 5877 allow_annotation_full_info = False 5878 5879 if ( 5880 allow_annotation_full_info 5881 and nb_annotation_field == len(annotation_fields) 5882 and annotation_fields_all 5883 and ( 5884 "INFO" in parquet_hdr_vcf_header_columns 5885 and "INFO" in database.get_extra_columns() 5886 ) 5887 ): 5888 log.debug("Column INFO annotation enabled") 5889 sql_query_annotation_update_info_sets = [] 5890 sql_query_annotation_update_info_sets.append( 5891 f" table_parquet.INFO " 5892 ) 5893 5894 if sql_query_annotation_update_info_sets: 5895 5896 # Annotate 5897 log.info(f"Annotation '{annotation_name}' - Annotation...") 5898 5899 # Join query annotation update info sets for SQL 5900 sql_query_annotation_update_info_sets_sql = ",".join( 5901 sql_query_annotation_update_info_sets 5902 ) 5903 5904 # Check chromosomes list (and variants infos) 5905 sql_query_chromosomes = f""" 5906 SELECT table_variants."#CHROM" as CHROM, count(*) AS count_variants, min(POS) AS min_variants, MAX(POS) AS max_variants 5907 FROM {table_variants} as table_variants 5908 GROUP BY table_variants."#CHROM" 5909 ORDER BY table_variants."#CHROM" 5910 """ 5911 sql_query_chromosomes_df = self.conn.execute( 5912 sql_query_chromosomes 5913 ).df() 5914 sql_query_chromosomes_dict = { 5915 entry["CHROM"]: { 5916 "count": entry["count_variants"], 5917 "min": entry["min_variants"], 5918 "max": entry["max_variants"], 5919 } 5920 for index, entry in sql_query_chromosomes_df.iterrows() 5921 } 5922 5923 # Init 5924 nb_of_query = 0 5925 nb_of_variant_annotated = 0 5926 query_dict = query_dict_remove 5927 5928 # for chrom in sql_query_chromosomes_df["CHROM"]: 5929 for chrom in sql_query_chromosomes_dict: 5930 5931 # Number of variant by chromosome 5932 nb_of_variant_by_chrom = sql_query_chromosomes_dict.get( 5933 chrom, {} 5934 ).get("count", 0) 5935 5936 
log.debug( 5937 f"Annotation '{annotation_name}' - Chromosome '{chrom}' [{nb_of_variant_by_chrom} variants]..." 5938 ) 5939 5940 # Annotation with regions database 5941 if parquet_type in ["regions"]: 5942 sql_query_annotation_from_clause = f""" 5943 FROM ( 5944 SELECT 5945 '{chrom}' AS \"#CHROM\", 5946 table_variants_from.\"POS\" AS \"POS\", 5947 {",".join(sql_query_annotation_to_agregate)} 5948 FROM {table_variants} as table_variants_from 5949 LEFT JOIN {parquet_file_link} as table_parquet_from ON ( 5950 table_parquet_from."#CHROM" = '{chrom}' 5951 AND table_variants_from.\"POS\" <= table_parquet_from.\"END\" 5952 AND (table_variants_from.\"POS\" >= (table_parquet_from.\"START\"+1) 5953 OR table_variants_from.\"POS\" + (len(table_variants_from.\"REF\")-1) >= (table_parquet_from.\"START\"+1) 5954 ) 5955 ) 5956 WHERE table_variants_from.\"#CHROM\" in ('{chrom}') 5957 GROUP BY table_variants_from.\"POS\" 5958 ) 5959 as table_parquet 5960 """ 5961 5962 sql_query_annotation_where_clause = """ 5963 table_parquet.\"#CHROM\" = table_variants.\"#CHROM\" 5964 AND table_parquet.\"POS\" = table_variants.\"POS\" 5965 """ 5966 5967 # Annotation with variants database 5968 else: 5969 sql_query_annotation_from_clause = f""" 5970 FROM {parquet_file_link} as table_parquet 5971 """ 5972 sql_query_annotation_where_clause = f""" 5973 table_variants."#CHROM" = '{chrom}' 5974 AND table_parquet.\"#CHROM\" = table_variants.\"#CHROM\" 5975 AND table_parquet.\"POS\" = table_variants.\"POS\" 5976 AND table_parquet.\"ALT\" = table_variants.\"ALT\" 5977 AND table_parquet.\"REF\" = table_variants.\"REF\" 5978 """ 5979 5980 # Create update query 5981 sql_query_annotation_chrom_interval_pos = f""" 5982 UPDATE {table_variants} as table_variants 5983 SET INFO = 5984 concat( 5985 CASE WHEN table_variants.INFO NOT IN ('','.') 5986 THEN table_variants.INFO 5987 ELSE '' 5988 END 5989 , 5990 CASE WHEN table_variants.INFO NOT IN ('','.') 5991 AND ( 5992 
concat({sql_query_annotation_update_info_sets_sql}) 5993 ) 5994 NOT IN ('','.') 5995 THEN ';' 5996 ELSE '' 5997 END 5998 , 5999 {sql_query_annotation_update_info_sets_sql} 6000 ) 6001 {sql_query_annotation_from_clause} 6002 WHERE {sql_query_annotation_where_clause} 6003 ; 6004 """ 6005 6006 # Add update query to dict 6007 query_dict[ 6008 f"{chrom} [{nb_of_variant_by_chrom} variants]" 6009 ] = sql_query_annotation_chrom_interval_pos 6010 6011 nb_of_query = len(query_dict) 6012 num_query = 0 6013 6014 # SET max_expression_depth TO x 6015 self.conn.execute("SET max_expression_depth TO 10000") 6016 6017 for query_name in query_dict: 6018 query = query_dict[query_name] 6019 num_query += 1 6020 log.info( 6021 f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name}..." 6022 ) 6023 result = self.conn.execute(query) 6024 nb_of_variant_annotated_by_query = result.df()["Count"][0] 6025 nb_of_variant_annotated += nb_of_variant_annotated_by_query 6026 log.info( 6027 f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name} - {nb_of_variant_annotated_by_query} variants annotated" 6028 ) 6029 6030 log.info( 6031 f"Annotation '{annotation_name}' - Annotation of {nb_of_variant_annotated} variants out of {nb_variants} (with {nb_of_query} queries)" 6032 ) 6033 6034 else: 6035 6036 log.info( 6037 f"Annotation '{annotation_name}' - No Annotations available" 6038 ) 6039 6040 log.debug("Final header: " + str(vcf_reader.infos)) 6041 6042 # Remove added columns 6043 for added_column in added_columns: 6044 self.drop_column(column=added_column) 6045 6046 def annotation_splice(self, threads: int = None) -> None: 6047 """ 6048 This function annotate with snpEff 6049 6050 :param threads: The number of threads to use 6051 :return: the value of the variable "return_value". 
6052 """ 6053 6054 # DEBUG 6055 log.debug("Start annotation with splice tools") 6056 6057 # Threads 6058 if not threads: 6059 threads = self.get_threads() 6060 log.debug("Threads: " + str(threads)) 6061 6062 # DEBUG 6063 delete_tmp = True 6064 if self.get_config().get("verbosity", "warning") in ["debug"]: 6065 delete_tmp = False 6066 log.debug("Delete tmp files/folders: " + str(delete_tmp)) 6067 6068 # Config 6069 config = self.get_config() 6070 log.debug("Config: " + str(config)) 6071 splice_config = config.get("tools", {}).get("splice", {}) 6072 if not splice_config: 6073 splice_config = DEFAULT_TOOLS_BIN.get("splice", {}) 6074 if not splice_config: 6075 msg_err = "No Splice tool config" 6076 log.error(msg_err) 6077 raise ValueError(msg_err) 6078 log.debug(f"splice_config={splice_config}") 6079 6080 # Config - Folders - Databases 6081 databases_folders = ( 6082 config.get("folders", {}).get("databases", {}).get("splice", ["."]) 6083 ) 6084 log.debug("Databases annotations: " + str(databases_folders)) 6085 6086 # Splice docker image 6087 splice_docker_image = splice_config.get("docker").get("image") 6088 6089 # Pull splice image if it's not already there 6090 if not check_docker_image_exists(splice_docker_image): 6091 log.warning( 6092 f"Annotation: splice docker image {splice_docker_image} not found locally, trying to pull from dockerhub" 6093 ) 6094 try: 6095 command(f"docker pull {splice_config.get('docker').get('image')}") 6096 except subprocess.CalledProcessError: 6097 msg_err = f"Unable to find docker {splice_docker_image} on dockerhub" 6098 log.error(msg_err) 6099 raise ValueError(msg_err) 6100 return None 6101 6102 # Config - splice databases 6103 splice_databases = ( 6104 config.get("folders", {}) 6105 .get("databases", {}) 6106 .get("splice", DEFAULT_SPLICE_FOLDER) 6107 ) 6108 splice_databases = full_path(splice_databases) 6109 6110 # Param 6111 param = self.get_param() 6112 log.debug("Param: " + str(param)) 6113 6114 # Param 6115 options = 
param.get("annotation", {}).get("splice", {}) 6116 log.debug("Options: " + str(options)) 6117 6118 # Data 6119 table_variants = self.get_table_variants() 6120 6121 # Check if not empty 6122 log.debug("Check if not empty") 6123 sql_query_chromosomes = ( 6124 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 6125 ) 6126 if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]: 6127 log.info("VCF empty") 6128 return None 6129 6130 # Export in VCF 6131 log.debug("Create initial file to annotate") 6132 6133 # Create output folder 6134 output_folder = os.path.join(self.get_tmp_dir(), f"splice-{get_random()}") 6135 if not os.path.exists(output_folder): 6136 Path(output_folder).mkdir(parents=True, exist_ok=True) 6137 6138 # Create tmp VCF file 6139 tmp_vcf = NamedTemporaryFile( 6140 prefix=self.get_prefix(), 6141 dir=output_folder, 6142 suffix=".vcf", 6143 delete=False, 6144 ) 6145 tmp_vcf_name = tmp_vcf.name 6146 6147 # VCF header 6148 header = self.get_header() 6149 6150 # Existing annotations 6151 for vcf_annotation in self.get_header().infos: 6152 6153 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 6154 log.debug( 6155 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 6156 ) 6157 6158 # Memory limit 6159 if config.get("memory", None): 6160 memory_limit = config.get("memory", "8G").upper() 6161 # upper() 6162 else: 6163 memory_limit = "8G" 6164 log.debug(f"memory_limit: {memory_limit}") 6165 6166 # Check number of variants to annotate 6167 where_clause_regex_spliceai = r"SpliceAI_\w+" 6168 where_clause_regex_spip = r"SPiP_\w+" 6169 where_clause = f""" WHERE NOT regexp_matches("INFO", '{where_clause_regex_spliceai}') AND NOT regexp_matches("INFO", '{where_clause_regex_spip}')""" 6170 df_list_of_variants_to_annotate = self.get_query_to_df( 6171 query=f""" SELECT * FROM variants {where_clause} """ 6172 ) 6173 if len(df_list_of_variants_to_annotate) == 0: 6174 log.warning( 6175 f"No variants to 
annotate with splice. Variants probably already annotated with splice" 6176 ) 6177 return None 6178 else: 6179 log.info(f"Annotation: {len(df_list_of_variants_to_annotate)} variants") 6180 6181 # Export VCF file 6182 self.export_variant_vcf( 6183 vcf_file=tmp_vcf_name, 6184 remove_info=True, 6185 add_samples=True, 6186 index=False, 6187 where_clause=where_clause, 6188 ) 6189 6190 # Create docker container and launch splice analysis 6191 if splice_config: 6192 6193 # Splice mount folders 6194 mount_folders = splice_config.get("mount", {}) 6195 6196 # Genome mount 6197 mount_folders[ 6198 config.get("folders", {}) 6199 .get("databases", {}) 6200 .get("genomes", DEFAULT_GENOME_FOLDER) 6201 ] = "ro" 6202 6203 # SpliceAI mount 6204 mount_folders[ 6205 config.get("folders", {}) 6206 .get("databases", {}) 6207 .get("spliceai", DEFAULT_SPLICEAI_FOLDER) 6208 ] = "ro" 6209 6210 # Genome mount 6211 mount_folders[ 6212 config.get("folders", {}) 6213 .get("databases", {}) 6214 .get("spip", DEFAULT_SPIP_FOLDER) 6215 ] = "ro" 6216 6217 # Mount folders 6218 mount = [] 6219 6220 # Config mount 6221 mount = [ 6222 f"-v {full_path(path)}:{full_path(path)}:{mode}" 6223 for path, mode in mount_folders.items() 6224 ] 6225 6226 if any(value for value in splice_config.values() if value is None): 6227 log.warning("At least one splice config parameter is empty") 6228 return None 6229 6230 # Params in splice nf 6231 def check_values(dico: dict): 6232 """ 6233 Ensure parameters for NF splice pipeline 6234 """ 6235 for key, val in dico.items(): 6236 if key == "genome": 6237 if any( 6238 assemb in options.get("genome", {}) 6239 for assemb in ["hg19", "GRCh37", "grch37", "GRCH37"] 6240 ): 6241 yield f"--{key} hg19" 6242 elif any( 6243 assemb in options.get("genome", {}) 6244 for assemb in ["hg38", "GRCh38", "grch38", "GRCH38"] 6245 ): 6246 yield f"--{key} hg38" 6247 elif ( 6248 (isinstance(val, str) and val) 6249 or isinstance(val, int) 6250 or isinstance(val, bool) 6251 ): 6252 yield f"--{key} 
{val}" 6253 6254 # Genome 6255 genome = options.get("genome", config.get("assembly", DEFAULT_ASSEMBLY)) 6256 options["genome"] = genome 6257 6258 # NF params 6259 nf_params = [] 6260 6261 # Add options 6262 if options: 6263 nf_params = list(check_values(options)) 6264 log.debug(f"Splice NF params: {' '.join(nf_params)}") 6265 else: 6266 log.debug("No NF params provided") 6267 6268 # Add threads 6269 if "threads" not in options.keys(): 6270 nf_params.append(f"--threads {threads}") 6271 6272 # Genome path 6273 genome_path = find_genome( 6274 config.get("folders", {}) 6275 .get("databases", {}) 6276 .get("genomes", DEFAULT_GENOME_FOLDER), 6277 file=f"{genome}.fa", 6278 ) 6279 # Add genome path 6280 if not genome_path: 6281 raise ValueError( 6282 f"Can't find genome assembly {genome}.fa in {config.get('folders', {}).get('databases', {}).get('genomes', DEFAULT_GENOME_FOLDER)}" 6283 ) 6284 else: 6285 log.debug(f"Genome: {genome_path}") 6286 nf_params.append(f"--genome_path {genome_path}") 6287 6288 def splice_annotations(options: dict = {}, config: dict = {}) -> list: 6289 """ 6290 Setting up updated databases for SPiP and SpliceAI 6291 """ 6292 6293 try: 6294 6295 # SpliceAI assembly transcriptome 6296 spliceai_assembly = os.path.join( 6297 config.get("folders", {}) 6298 .get("databases", {}) 6299 .get("spliceai", {}), 6300 options.get("genome"), 6301 "transcriptome", 6302 ) 6303 spip_assembly = options.get("genome") 6304 6305 spip = find( 6306 f"transcriptome_{spip_assembly}.RData", 6307 config.get("folders", {}).get("databases", {}).get("spip", {}), 6308 ) 6309 spliceai = find("spliceai.refseq.txt", spliceai_assembly) 6310 log.debug(f"SPiP annotations: {spip}") 6311 log.debug(f"SpliceAI annotations: {spliceai}") 6312 if spip and spliceai: 6313 return [ 6314 f"--spip_transcriptome {spip}", 6315 f"--spliceai_annotations {spliceai}", 6316 ] 6317 else: 6318 # TODO crash and go on with basic annotations ? 
6319 # raise ValueError( 6320 # "Can't find splice databases in configuration EXIT" 6321 # ) 6322 log.warning( 6323 "Can't find splice databases in configuration, use annotations file from image" 6324 ) 6325 except TypeError: 6326 log.warning( 6327 "Can't find splice databases in configuration, use annotations file from image" 6328 ) 6329 return [] 6330 6331 # Add options, check if transcriptome option have already beend provided 6332 if ( 6333 "spip_transcriptome" not in nf_params 6334 and "spliceai_transcriptome" not in nf_params 6335 ): 6336 splice_reference = splice_annotations(options, config) 6337 if splice_reference: 6338 nf_params.extend(splice_reference) 6339 6340 nf_params.append(f"--output_folder {output_folder}") 6341 6342 random_uuid = f"HOWARD-SPLICE-{get_random()}" 6343 cmd = f"nextflow -log {os.path.join(output_folder, f'{random_uuid}.log')} -c /app/SpliceToolBox/src/splicetoolbox/nextflow/nextflow.docker.config run /app/SpliceToolBox/src/splicetoolbox/nextflow/main.nf -entry SPLICE --vcf {tmp_vcf_name} {' '.join(nf_params)} -profile standard,conda,singularity,report,timeline" 6344 log.debug(cmd) 6345 6346 splice_config["docker"]["command"] = cmd 6347 6348 docker_cmd = get_bin_command( 6349 tool="splice", 6350 bin_type="docker", 6351 config=config, 6352 default_folder=f"{DEFAULT_TOOLS_FOLDER}/docker", 6353 add_options=f"--name {random_uuid} {' '.join(mount)}", 6354 ) 6355 6356 # Docker debug 6357 # if splice_config.get("rm_container"): 6358 # rm_container = "--rm" 6359 # else: 6360 # rm_container = "" 6361 # docker_cmd = f"docker run {rm_container} --entrypoint '/bin/bash' --name {random_uuid} {' '.join(mount)} {':'.join(splice_config.get('image'))} {cmd}" 6362 6363 log.debug(docker_cmd) 6364 res = subprocess.run(docker_cmd, shell=True, capture_output=True, text=True) 6365 log.debug(res.stdout) 6366 if res.stderr: 6367 log.error(res.stderr) 6368 res.check_returncode() 6369 else: 6370 log.warning(f"Splice tool configuration not found: {config}") 6371 
6372 # Update variants 6373 log.info("Annotation - Updating...") 6374 # Test find output vcf 6375 log.debug( 6376 f"TMP splice output: {os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz" 6377 ) 6378 output_vcf = [] 6379 # Wrong folder to look in 6380 for files in os.listdir(os.path.dirname(tmp_vcf_name)): 6381 if ( 6382 files 6383 == f"{os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz" 6384 ): 6385 output_vcf.append(os.path.join(os.path.dirname(tmp_vcf_name), files)) 6386 # log.debug(os.listdir(options.get("output_folder"))) 6387 log.debug(f"Splice annotated vcf: {output_vcf[0]}") 6388 if not output_vcf: 6389 log.debug( 6390 f"Splice output was not generated {os.path.basename(tmp_vcf_name)}*.spip.spliceai.sorted.vcf.gz" 6391 ) 6392 else: 6393 # Get new header from annotated vcf 6394 log.debug(f"Initial header: {len(header.infos)} fields") 6395 # Create new header with splice infos 6396 new_vcf = Variants(input=output_vcf[0]) 6397 new_vcf_header = new_vcf.get_header().infos 6398 for keys, infos in new_vcf_header.items(): 6399 if keys not in header.infos.keys(): 6400 header.infos[keys] = infos 6401 log.debug(f"New header: {len(header.infos)} fields") 6402 log.debug(f"Splice tmp output: {output_vcf[0]}") 6403 self.update_from_vcf(output_vcf[0]) 6404 6405 # Remove folder 6406 remove_if_exists(output_folder) 6407 6408 ### 6409 # Prioritization 6410 ### 6411 6412 def get_config_default(self, name: str) -> dict: 6413 """ 6414 The function `get_config_default` returns a dictionary containing default configurations for 6415 various calculations and prioritizations. 6416 6417 :param name: The `get_config_default` function returns a dictionary containing default 6418 configurations for different calculations and prioritizations. 
The `name` parameter is used to 6419 specify which specific configuration to retrieve from the dictionary 6420 :type name: str 6421 :return: The function `get_config_default` returns a dictionary containing default configuration 6422 settings for different calculations and prioritizations. The specific configuration settings are 6423 retrieved based on the input `name` parameter provided to the function. If the `name` parameter 6424 matches a key in the `config_default` dictionary, the corresponding configuration settings are 6425 returned. If there is no match, an empty dictionary is returned. 6426 """ 6427 6428 config_default = { 6429 "calculations": { 6430 "variant_chr_pos_alt_ref": { 6431 "type": "sql", 6432 "name": "variant_chr_pos_alt_ref", 6433 "description": "Create a variant ID with chromosome, position, alt and ref", 6434 "available": False, 6435 "output_column_name": "variant_chr_pos_alt_ref", 6436 "output_column_type": "String", 6437 "output_column_description": "variant ID with chromosome, position, alt and ref", 6438 "operation_query": """ concat("#CHROM", '_', "POS", '_', "REF", '_', "ALT") """, 6439 "operation_info": True, 6440 }, 6441 "VARTYPE": { 6442 "type": "sql", 6443 "name": "VARTYPE", 6444 "description": "Variant type (e.g. 
SNV, INDEL, MNV, BND...)", 6445 "available": True, 6446 "output_column_name": "VARTYPE", 6447 "output_column_type": "String", 6448 "output_column_description": "Variant type: SNV if X>Y, MOSAIC if X>Y,Z or X,Y>Z, INDEL if XY>Z or X>YZ", 6449 "operation_query": """ 6450 CASE 6451 WHEN "SVTYPE" NOT NULL THEN "SVTYPE" 6452 WHEN LENGTH(REF) = 1 AND LENGTH(ALT) = 1 THEN 'SNV' 6453 WHEN REF LIKE '%,%' OR ALT LIKE '%,%' THEN 'MOSAIC' 6454 WHEN LENGTH(REF) == LENGTH(ALT) AND LENGTH(REF) > 1 THEN 'MNV' 6455 WHEN LENGTH(REF) <> LENGTH(ALT) THEN 'INDEL' 6456 ELSE 'UNDEFINED' 6457 END 6458 """, 6459 "info_fields": ["SVTYPE"], 6460 "operation_info": True, 6461 }, 6462 "snpeff_hgvs": { 6463 "type": "python", 6464 "name": "snpeff_hgvs", 6465 "description": "HGVS nomenclatures from snpEff annotation", 6466 "available": True, 6467 "function_name": "calculation_extract_snpeff_hgvs", 6468 "function_params": ["snpeff_hgvs", "ANN"], 6469 }, 6470 "snpeff_ann_explode": { 6471 "type": "python", 6472 "name": "snpeff_ann_explode", 6473 "description": "Explode snpEff annotations with uniquify values", 6474 "available": True, 6475 "function_name": "calculation_snpeff_ann_explode", 6476 "function_params": [False, "fields", "snpeff_", "ANN"], 6477 }, 6478 "snpeff_ann_explode_uniquify": { 6479 "type": "python", 6480 "name": "snpeff_ann_explode_uniquify", 6481 "description": "Explode snpEff annotations", 6482 "available": True, 6483 "function_name": "calculation_snpeff_ann_explode", 6484 "function_params": [True, "fields", "snpeff_uniquify_", "ANN"], 6485 }, 6486 "snpeff_ann_explode_json": { 6487 "type": "python", 6488 "name": "snpeff_ann_explode_json", 6489 "description": "Explode snpEff annotations in JSON format", 6490 "available": True, 6491 "function_name": "calculation_snpeff_ann_explode", 6492 "function_params": [False, "JSON", "snpeff_json", "ANN"], 6493 }, 6494 "NOMEN": { 6495 "type": "python", 6496 "name": "NOMEN", 6497 "description": "NOMEN information (e.g. NOMEN, CNOMEN, PNOMEN...) 
from HGVS nomenclature field", 6498 "available": True, 6499 "function_name": "calculation_extract_nomen", 6500 "function_params": [], 6501 }, 6502 "FINDBYPIPELINE": { 6503 "type": "python", 6504 "name": "FINDBYPIPELINE", 6505 "description": "Number of pipeline that identify the variant (for multi pipeline VCF)", 6506 "available": True, 6507 "function_name": "calculation_find_by_pipeline", 6508 "function_params": ["findbypipeline"], 6509 }, 6510 "FINDBYSAMPLE": { 6511 "type": "python", 6512 "name": "FINDBYSAMPLE", 6513 "description": "Number of sample that have a genotype for the variant (for multi sample VCF)", 6514 "available": True, 6515 "function_name": "calculation_find_by_pipeline", 6516 "function_params": ["findbysample"], 6517 }, 6518 "GENOTYPECONCORDANCE": { 6519 "type": "python", 6520 "name": "GENOTYPECONCORDANCE", 6521 "description": "Concordance of genotype for multi caller VCF", 6522 "available": True, 6523 "function_name": "calculation_genotype_concordance", 6524 "function_params": [], 6525 }, 6526 "BARCODE": { 6527 "type": "python", 6528 "name": "BARCODE", 6529 "description": "BARCODE as VaRank tool", 6530 "available": True, 6531 "function_name": "calculation_barcode", 6532 "function_params": [], 6533 }, 6534 "BARCODEFAMILY": { 6535 "type": "python", 6536 "name": "BARCODEFAMILY", 6537 "description": "BARCODEFAMILY as VaRank tool", 6538 "available": True, 6539 "function_name": "calculation_barcode_family", 6540 "function_params": ["BCF"], 6541 }, 6542 "TRIO": { 6543 "type": "python", 6544 "name": "TRIO", 6545 "description": "Inheritance for a trio family", 6546 "available": True, 6547 "function_name": "calculation_trio", 6548 "function_params": [], 6549 }, 6550 "VAF": { 6551 "type": "python", 6552 "name": "VAF", 6553 "description": "Variant Allele Frequency (VAF) harmonization", 6554 "available": True, 6555 "function_name": "calculation_vaf_normalization", 6556 "function_params": [], 6557 }, 6558 "VAF_stats": { 6559 "type": "python", 6560 "name": 
"VAF_stats", 6561 "description": "Variant Allele Frequency (VAF) statistics", 6562 "available": True, 6563 "function_name": "calculation_genotype_stats", 6564 "function_params": ["VAF"], 6565 }, 6566 "DP_stats": { 6567 "type": "python", 6568 "name": "DP_stats", 6569 "description": "Depth (DP) statistics", 6570 "available": True, 6571 "function_name": "calculation_genotype_stats", 6572 "function_params": ["DP"], 6573 }, 6574 "variant_id": { 6575 "type": "python", 6576 "name": "variant_id", 6577 "description": "Variant ID generated from variant position and type", 6578 "available": True, 6579 "function_name": "calculation_variant_id", 6580 "function_params": [], 6581 }, 6582 "transcripts_json": { 6583 "type": "python", 6584 "name": "transcripts_json", 6585 "description": "Add transcripts annotations in JSON format (field 'transcripts_json')", 6586 "available": True, 6587 "function_name": "calculation_transcripts_annotation", 6588 "function_params": ["transcripts_json", None], 6589 }, 6590 "transcripts_ann": { 6591 "type": "python", 6592 "name": "transcripts_ann", 6593 "description": "Add transcripts annotations in structured format (field 'transcripts_ann')", 6594 "available": True, 6595 "function_name": "calculation_transcripts_annotation", 6596 "function_params": [None, "transcripts_ann"], 6597 }, 6598 "transcripts_annotations": { 6599 "type": "python", 6600 "name": "transcripts_annotations", 6601 "description": "Add transcripts annotations in JSON and/or structured format (see param JSON file)", 6602 "available": True, 6603 "function_name": "calculation_transcripts_annotation", 6604 "function_params": [None, None], 6605 }, 6606 "transcripts_prioritization": { 6607 "type": "python", 6608 "name": "transcripts_prioritization", 6609 "description": "Prioritize transcripts with a prioritization profile (using param.json)", 6610 "available": True, 6611 "function_name": "calculation_transcripts_prioritization", 6612 "function_params": [], 6613 }, 6614 }, 6615 
"prioritizations": { 6616 "default": { 6617 "filter": [ 6618 { 6619 "type": "notequals", 6620 "value": "!PASS|\\.", 6621 "score": 0, 6622 "flag": "FILTERED", 6623 "comment": ["Bad variant quality"], 6624 }, 6625 { 6626 "type": "equals", 6627 "value": "REJECT", 6628 "score": -20, 6629 "flag": "PASS", 6630 "comment": ["Bad variant quality"], 6631 }, 6632 ], 6633 "DP": [ 6634 { 6635 "type": "gte", 6636 "value": "50", 6637 "score": 5, 6638 "flag": "PASS", 6639 "comment": ["DP higher than 50"], 6640 } 6641 ], 6642 "ANN": [ 6643 { 6644 "type": "contains", 6645 "value": "HIGH", 6646 "score": 5, 6647 "flag": "PASS", 6648 "comment": [ 6649 "The variant is assumed to have high (disruptive) impact in the protein, probably causing protein truncation, loss of function or triggering nonsense mediated decay" 6650 ], 6651 }, 6652 { 6653 "type": "contains", 6654 "value": "MODERATE", 6655 "score": 3, 6656 "flag": "PASS", 6657 "comment": [ 6658 "A non-disruptive variant that might change protein effectiveness" 6659 ], 6660 }, 6661 { 6662 "type": "contains", 6663 "value": "LOW", 6664 "score": 0, 6665 "flag": "FILTERED", 6666 "comment": [ 6667 "Assumed to be mostly harmless or unlikely to change protein behavior" 6668 ], 6669 }, 6670 { 6671 "type": "contains", 6672 "value": "MODIFIER", 6673 "score": 0, 6674 "flag": "FILTERED", 6675 "comment": [ 6676 "Usually non-coding variants or variants affecting non-coding genes, where predictions are difficult or there is no evidence of impact" 6677 ], 6678 }, 6679 ], 6680 } 6681 }, 6682 } 6683 6684 return config_default.get(name, None) 6685 6686 def get_config_json( 6687 self, name: str, config_dict: dict = {}, config_file: str = None 6688 ) -> dict: 6689 """ 6690 The function `get_config_json` retrieves a configuration JSON object with prioritizations from 6691 default values, a dictionary, and a file. 6692 6693 :param name: The `name` parameter in the `get_config_json` function is a string that represents 6694 the name of the configuration. 
It is used to identify and retrieve the configuration settings 6695 for a specific component or module 6696 :type name: str 6697 :param config_dict: The `config_dict` parameter in the `get_config_json` function is a 6698 dictionary that allows you to provide additional configuration settings or overrides. When you 6699 call the `get_config_json` function, you can pass a dictionary containing key-value pairs where 6700 the key is the configuration setting you want to override or 6701 :type config_dict: dict 6702 :param config_file: The `config_file` parameter in the `get_config_json` function is used to 6703 specify the path to a configuration file that contains additional settings. If provided, the 6704 function will read the contents of this file and update the configuration dictionary with the 6705 values found in the file, overriding any existing values with the 6706 :type config_file: str 6707 :return: The function `get_config_json` returns a dictionary containing the configuration 6708 settings. 
6709 """ 6710 6711 # Create with default prioritizations 6712 config_default = self.get_config_default(name=name) 6713 configuration = config_default 6714 # log.debug(f"configuration={configuration}") 6715 6716 # Replace prioritizations from dict 6717 for config in config_dict: 6718 configuration[config] = config_dict[config] 6719 6720 # Replace prioritizations from file 6721 config_file = full_path(config_file) 6722 if config_file: 6723 if os.path.exists(config_file): 6724 with open(config_file) as config_file_content: 6725 config_file_dict = json.load(config_file_content) 6726 for config in config_file_dict: 6727 configuration[config] = config_file_dict[config] 6728 else: 6729 msg_error = f"Config '{name}' file '{config_file}' does NOT exist" 6730 log.error(msg_error) 6731 raise ValueError(msg_error) 6732 6733 return configuration 6734 6735 def prioritization( 6736 self, table: str = None, pz_prefix: str = None, pz_param: dict = None 6737 ) -> bool: 6738 """ 6739 The `prioritization` function in Python processes VCF files, adds new INFO fields, and 6740 prioritizes variants based on configured profiles and criteria. 6741 6742 :param table: The `table` parameter in the `prioritization` function is used to specify the name 6743 of the table (presumably a VCF file) on which the prioritization operation will be performed. If 6744 a table name is provided, the method will prioritize the variants in that specific table 6745 :type table: str 6746 :param pz_prefix: The `pz_prefix` parameter is used to specify a prefix that will be added to 6747 certain INFO fields in a VCF file during the prioritization process. If this parameter is not 6748 provided, the code will use a default prefix value of "PZ" 6749 :type pz_prefix: str 6750 :param pz_param: The `pz_param` parameter in the `prioritization` method is used to pass 6751 additional parameters specific to the prioritization process. 
These parameters can include 6752 settings related to prioritization profiles, fields, scoring modes, flags, comments, and other 6753 configurations needed for the prioritization of variants in a V 6754 :type pz_param: dict 6755 :return: A boolean value (True) is being returned from the `prioritization` function. 6756 """ 6757 6758 # Config 6759 config = self.get_config() 6760 6761 # Param 6762 param = self.get_param() 6763 6764 # Prioritization param 6765 if pz_param is not None: 6766 prioritization_param = pz_param 6767 else: 6768 prioritization_param = param.get("prioritization", {}) 6769 6770 # Configuration profiles 6771 prioritization_config_file = prioritization_param.get( 6772 "prioritization_config", None 6773 ) 6774 prioritization_config_file = full_path(prioritization_config_file) 6775 prioritizations_config = self.get_config_json( 6776 name="prioritizations", config_file=prioritization_config_file 6777 ) 6778 6779 # Prioritization prefix 6780 pz_prefix_default = "PZ" 6781 if pz_prefix is None: 6782 pz_prefix = prioritization_param.get("pzprefix", pz_prefix_default) 6783 6784 # Prioritization options 6785 profiles = prioritization_param.get("profiles", []) 6786 if isinstance(profiles, str): 6787 profiles = profiles.split(",") 6788 pzfields = prioritization_param.get( 6789 "pzfields", [f"{pz_prefix}Flag", f"{pz_prefix}Score"] 6790 ) 6791 if isinstance(pzfields, str): 6792 pzfields = pzfields.split(",") 6793 default_profile = prioritization_param.get("default_profile", None) 6794 pzfields_sep = prioritization_param.get("pzfields_sep", "_") 6795 prioritization_score_mode = prioritization_param.get( 6796 "prioritization_score_mode", "HOWARD" 6797 ) 6798 6799 # Quick Prioritizations 6800 prioritizations = param.get("prioritizations", None) 6801 if prioritizations: 6802 log.info("Quick Prioritization:") 6803 for profile in prioritizations.split(","): 6804 if profile not in profiles: 6805 profiles.append(profile) 6806 log.info(f" {profile}") 6807 6808 # If 
profile "ALL" provided, all profiles in the config profiles 6809 if "ALL" in profiles: 6810 profiles = list(prioritizations_config.keys()) 6811 6812 for profile in profiles: 6813 if prioritizations_config.get(profile, None): 6814 log.debug(f"Profile '{profile}' configured") 6815 else: 6816 msg_error = f"Profile '{profile}' NOT configured" 6817 log.error(msg_error) 6818 raise ValueError(msg_error) 6819 6820 if profiles: 6821 log.info(f"Prioritization... ") 6822 else: 6823 log.debug(f"No profile defined") 6824 return False 6825 6826 if not default_profile and len(profiles): 6827 default_profile = profiles[0] 6828 6829 log.debug("Profiles availables: " + str(list(prioritizations_config.keys()))) 6830 log.debug("Profiles to check: " + str(list(profiles))) 6831 6832 # Variables 6833 if table is not None: 6834 table_variants = table 6835 else: 6836 table_variants = self.get_table_variants(clause="update") 6837 log.debug(f"Table to prioritize: {table_variants}") 6838 6839 # Added columns 6840 added_columns = [] 6841 6842 # Create list of PZfields 6843 # List of PZFields 6844 list_of_pzfields_original = pzfields + [ 6845 pzfield + pzfields_sep + profile 6846 for pzfield in pzfields 6847 for profile in profiles 6848 ] 6849 list_of_pzfields = [] 6850 log.debug(f"{list_of_pzfields_original}") 6851 6852 # Remove existing PZfields to use if exists 6853 for pzfield in list_of_pzfields_original: 6854 if self.get_header().infos.get(pzfield, None) is None: 6855 list_of_pzfields.append(pzfield) 6856 log.debug(f"VCF Input - Header - PZfield '{pzfield}' not in VCF") 6857 else: 6858 log.debug(f"VCF Input - Header - PZfield '{pzfield}' already in VCF") 6859 6860 if list_of_pzfields: 6861 6862 # Explode Infos prefix 6863 explode_infos_prefix = self.get_explode_infos_prefix() 6864 6865 # PZfields tags description 6866 PZfields_INFOS = { 6867 f"{pz_prefix}Tags": { 6868 "ID": f"{pz_prefix}Tags", 6869 "Number": ".", 6870 "Type": "String", 6871 "Description": "Variant tags based on annotation 
criteria", 6872 }, 6873 f"{pz_prefix}Score": { 6874 "ID": f"{pz_prefix}Score", 6875 "Number": 1, 6876 "Type": "Integer", 6877 "Description": "Variant score based on annotation criteria", 6878 }, 6879 f"{pz_prefix}Flag": { 6880 "ID": f"{pz_prefix}Flag", 6881 "Number": 1, 6882 "Type": "String", 6883 "Description": "Variant flag based on annotation criteria", 6884 }, 6885 f"{pz_prefix}Comment": { 6886 "ID": f"{pz_prefix}Comment", 6887 "Number": ".", 6888 "Type": "String", 6889 "Description": "Variant comment based on annotation criteria", 6890 }, 6891 f"{pz_prefix}Infos": { 6892 "ID": f"{pz_prefix}Infos", 6893 "Number": ".", 6894 "Type": "String", 6895 "Description": "Variant infos based on annotation criteria", 6896 }, 6897 } 6898 6899 # Create INFO fields if not exist 6900 for field in PZfields_INFOS: 6901 field_ID = PZfields_INFOS[field]["ID"] 6902 field_description = PZfields_INFOS[field]["Description"] 6903 if field_ID not in self.get_header().infos and field_ID in pzfields: 6904 field_description = ( 6905 PZfields_INFOS[field]["Description"] 6906 + f", profile {default_profile}" 6907 ) 6908 self.get_header().infos[field_ID] = vcf.parser._Info( 6909 field_ID, 6910 PZfields_INFOS[field]["Number"], 6911 PZfields_INFOS[field]["Type"], 6912 field_description, 6913 "unknown", 6914 "unknown", 6915 code_type_map[PZfields_INFOS[field]["Type"]], 6916 ) 6917 6918 # Create INFO fields if not exist for each profile 6919 for profile in prioritizations_config: 6920 if profile in profiles or profiles == []: 6921 for field in PZfields_INFOS: 6922 field_ID = PZfields_INFOS[field]["ID"] + pzfields_sep + profile 6923 field_description = ( 6924 PZfields_INFOS[field]["Description"] 6925 + f", profile {profile}" 6926 ) 6927 if ( 6928 field_ID not in self.get_header().infos 6929 and field in pzfields 6930 ): 6931 self.get_header().infos[field_ID] = vcf.parser._Info( 6932 field_ID, 6933 PZfields_INFOS[field]["Number"], 6934 PZfields_INFOS[field]["Type"], 6935 field_description, 6936 
"unknown", 6937 "unknown", 6938 code_type_map[PZfields_INFOS[field]["Type"]], 6939 ) 6940 6941 # Header 6942 for pzfield in list_of_pzfields: 6943 if re.match(f"{pz_prefix}Score.*", pzfield): 6944 added_column = self.add_column( 6945 table_name=table_variants, 6946 column_name=pzfield, 6947 column_type="INTEGER", 6948 default_value="0", 6949 ) 6950 elif re.match(f"{pz_prefix}Flag.*", pzfield): 6951 added_column = self.add_column( 6952 table_name=table_variants, 6953 column_name=pzfield, 6954 column_type="BOOLEAN", 6955 default_value="1", 6956 ) 6957 else: 6958 added_column = self.add_column( 6959 table_name=table_variants, 6960 column_name=pzfield, 6961 column_type="STRING", 6962 default_value="''", 6963 ) 6964 added_columns.append(added_column) 6965 6966 # Profiles 6967 if profiles: 6968 6969 # foreach profile in configuration file 6970 for profile in prioritizations_config: 6971 6972 # If profile is asked in param, or ALL are asked (empty profile []) 6973 if profile in profiles or profiles == []: 6974 log.info(f"Profile '{profile}'") 6975 6976 sql_set_info_option = "" 6977 6978 sql_set_info = [] 6979 6980 # PZ fields set 6981 6982 # PZScore 6983 if ( 6984 f"{pz_prefix}Score{pzfields_sep}{profile}" 6985 in list_of_pzfields 6986 ): 6987 sql_set_info.append( 6988 f""" 6989 concat( 6990 '{pz_prefix}Score{pzfields_sep}{profile}=', 6991 {pz_prefix}Score{pzfields_sep}{profile} 6992 ) 6993 """ 6994 ) 6995 if ( 6996 profile == default_profile 6997 and f"{pz_prefix}Score" in list_of_pzfields 6998 ): 6999 sql_set_info.append( 7000 f""" 7001 concat( 7002 '{pz_prefix}Score=', 7003 {pz_prefix}Score{pzfields_sep}{profile} 7004 ) 7005 """ 7006 ) 7007 7008 # PZFlag 7009 if ( 7010 f"{pz_prefix}Flag{pzfields_sep}{profile}" 7011 in list_of_pzfields 7012 ): 7013 sql_set_info.append( 7014 f""" 7015 concat( 7016 '{pz_prefix}Flag{pzfields_sep}{profile}=', 7017 CASE 7018 WHEN {pz_prefix}Flag{pzfields_sep}{profile}==1 7019 THEN 'PASS' 7020 WHEN {pz_prefix}Flag{pzfields_sep}{profile}==0 
7021 THEN 'FILTERED' 7022 END 7023 ) 7024 """ 7025 ) 7026 if ( 7027 profile == default_profile 7028 and f"{pz_prefix}Flag" in list_of_pzfields 7029 ): 7030 sql_set_info.append( 7031 f""" 7032 concat( 7033 '{pz_prefix}Flag=', 7034 CASE 7035 WHEN {pz_prefix}Flag{pzfields_sep}{profile}==1 7036 THEN 'PASS' 7037 WHEN {pz_prefix}Flag{pzfields_sep}{profile}==0 7038 THEN 'FILTERED' 7039 END 7040 ) 7041 """ 7042 ) 7043 7044 # PZComment 7045 if ( 7046 f"{pz_prefix}Comment{pzfields_sep}{profile}" 7047 in list_of_pzfields 7048 ): 7049 sql_set_info.append( 7050 f""" 7051 CASE 7052 WHEN {pz_prefix}Comment{pzfields_sep}{profile} NOT IN ('') 7053 THEN concat('{pz_prefix}Comment{pzfields_sep}{profile}=', {pz_prefix}Comment{pzfields_sep}{profile}) 7054 ELSE '' 7055 END 7056 """ 7057 ) 7058 if ( 7059 profile == default_profile 7060 and f"{pz_prefix}Comment" in list_of_pzfields 7061 ): 7062 sql_set_info.append( 7063 f""" 7064 CASE 7065 WHEN {pz_prefix}Comment{pzfields_sep}{profile} NOT IN ('') 7066 THEN concat('{pz_prefix}Comment=', {pz_prefix}Comment{pzfields_sep}{profile}) 7067 ELSE '' 7068 END 7069 """ 7070 ) 7071 7072 # PZInfos 7073 if ( 7074 f"{pz_prefix}Infos{pzfields_sep}{profile}" 7075 in list_of_pzfields 7076 ): 7077 sql_set_info.append( 7078 f""" 7079 CASE 7080 WHEN {pz_prefix}Infos{pzfields_sep}{profile} NOT IN ('') 7081 THEN concat('{pz_prefix}Infos{pzfields_sep}{profile}=', {pz_prefix}Infos{pzfields_sep}{profile}) 7082 ELSE '' 7083 END 7084 """ 7085 ) 7086 if ( 7087 profile == default_profile 7088 and f"{pz_prefix}Infos" in list_of_pzfields 7089 ): 7090 sql_set_info.append( 7091 f""" 7092 CASE 7093 WHEN {pz_prefix}Infos{pzfields_sep}{profile} NOT IN ('') 7094 THEN concat('{pz_prefix}Infos=', {pz_prefix}Infos{pzfields_sep}{profile}) 7095 ELSE '' 7096 END 7097 """ 7098 ) 7099 7100 # Merge PZfields 7101 sql_set_info_option = "" 7102 sql_set_sep = "" 7103 for sql_set in sql_set_info: 7104 if sql_set_sep: 7105 sql_set_info_option += f""" 7106 , concat('{sql_set_sep}', 
{sql_set}) 7107 """ 7108 else: 7109 sql_set_info_option += f""" 7110 , {sql_set} 7111 """ 7112 sql_set_sep = ";" 7113 7114 sql_queries = [] 7115 for annotation in prioritizations_config[profile]: 7116 7117 # Explode specific annotation 7118 log.debug(f"Explode annotation '{annotation}'") 7119 added_columns += self.explode_infos( 7120 prefix=explode_infos_prefix, 7121 fields=[annotation], 7122 table=table_variants, 7123 ) 7124 extra_infos = self.get_extra_infos(table=table_variants) 7125 7126 # Check if annotation field is present 7127 if not f"{explode_infos_prefix}{annotation}" in extra_infos: 7128 log.debug(f"Annotation '{annotation}' not in data") 7129 continue 7130 else: 7131 log.debug(f"Annotation '{annotation}' in data") 7132 7133 # For each criterions 7134 for criterion in prioritizations_config[profile][ 7135 annotation 7136 ]: 7137 criterion_type = criterion["type"] 7138 criterion_value = criterion["value"] 7139 criterion_score = criterion.get("score", 0) 7140 criterion_flag = criterion.get("flag", "PASS") 7141 criterion_flag_bool = criterion_flag == "PASS" 7142 criterion_comment = ( 7143 ", ".join(criterion.get("comment", [])) 7144 .replace("'", "''") 7145 .replace(";", ",") 7146 .replace("\t", " ") 7147 ) 7148 criterion_infos = ( 7149 str(criterion) 7150 .replace("'", "''") 7151 .replace(";", ",") 7152 .replace("\t", " ") 7153 ) 7154 7155 sql_set = [] 7156 sql_set_info = [] 7157 7158 # PZ fields set 7159 if ( 7160 f"{pz_prefix}Score{pzfields_sep}{profile}" 7161 in list_of_pzfields 7162 ): 7163 if prioritization_score_mode == "HOWARD": 7164 sql_set.append( 7165 f"{pz_prefix}Score{pzfields_sep}{profile} = {pz_prefix}Score{pzfields_sep}{profile} + {criterion_score}" 7166 ) 7167 elif prioritization_score_mode == "VaRank": 7168 sql_set.append( 7169 f"{pz_prefix}Score{pzfields_sep}{profile} = CASE WHEN {criterion_score}>{pz_prefix}Score{pzfields_sep}{profile} THEN {criterion_score} END" 7170 ) 7171 else: 7172 sql_set.append( 7173 
f"{pz_prefix}Score{pzfields_sep}{profile} = {pz_prefix}Score{pzfields_sep}{profile} + {criterion_score}" 7174 ) 7175 if ( 7176 f"{pz_prefix}Flag{pzfields_sep}{profile}" 7177 in list_of_pzfields 7178 ): 7179 sql_set.append( 7180 f"{pz_prefix}Flag{pzfields_sep}{profile} = {pz_prefix}Flag{pzfields_sep}{profile} AND {criterion_flag_bool}" 7181 ) 7182 if ( 7183 f"{pz_prefix}Comment{pzfields_sep}{profile}" 7184 in list_of_pzfields 7185 ): 7186 sql_set.append( 7187 f""" 7188 {pz_prefix}Comment{pzfields_sep}{profile} = 7189 concat( 7190 {pz_prefix}Comment{pzfields_sep}{profile}, 7191 CASE 7192 WHEN {pz_prefix}Comment{pzfields_sep}{profile}!='' 7193 THEN ', ' 7194 ELSE '' 7195 END, 7196 '{criterion_comment}' 7197 ) 7198 """ 7199 ) 7200 if ( 7201 f"{pz_prefix}Infos{pzfields_sep}{profile}" 7202 in list_of_pzfields 7203 ): 7204 sql_set.append( 7205 f""" 7206 {pz_prefix}Infos{pzfields_sep}{profile} = 7207 concat( 7208 {pz_prefix}Infos{pzfields_sep}{profile}, 7209 '{criterion_infos}' 7210 ) 7211 """ 7212 ) 7213 sql_set_option = ",".join(sql_set) 7214 7215 # Criterion and comparison 7216 if sql_set_option: 7217 try: 7218 float(criterion_value) 7219 sql_update = f""" 7220 UPDATE {table_variants} 7221 SET {sql_set_option} 7222 WHERE CAST("{explode_infos_prefix}{annotation}" AS VARCHAR) NOT IN ('','.') 7223 AND CAST("{explode_infos_prefix}{annotation}" AS FLOAT){comparison_map[criterion_type]}{criterion_value} 7224 """ 7225 except: 7226 contains_option = "" 7227 if criterion_type == "contains": 7228 contains_option = ".*" 7229 sql_update = f""" 7230 UPDATE {table_variants} 7231 SET {sql_set_option} 7232 WHERE "{explode_infos_prefix}{annotation}" SIMILAR TO '{contains_option}{criterion_value}{contains_option}' 7233 """ 7234 sql_queries.append(sql_update) 7235 else: 7236 log.warning( 7237 f"NO SQL SET option for '{annotation}' - '{criterion}'" 7238 ) 7239 7240 # PZTags 7241 if ( 7242 f"{pz_prefix}Tags{pzfields_sep}{profile}" 7243 in list_of_pzfields 7244 ): 7245 7246 # Create PZFalgs 
value 7247 pztags_value = "" 7248 pztags_sep_default = "|" 7249 pztags_sep = "" 7250 for pzfield in pzfields: 7251 if pzfield not in [f"{pz_prefix}Tags"]: 7252 if ( 7253 f"{pzfield}{pzfields_sep}{profile}" 7254 in list_of_pzfields 7255 ): 7256 if pzfield in [f"{pz_prefix}Flag"]: 7257 pztags_value += f"""{pztags_sep}{pzfield}#', 7258 CASE WHEN {pz_prefix}Flag{pzfields_sep}{profile} 7259 THEN 'PASS' 7260 ELSE 'FILTERED' 7261 END, '""" 7262 else: 7263 pztags_value += f"{pztags_sep}{pzfield}#', {pzfield}{pzfields_sep}{profile}, '" 7264 pztags_sep = pztags_sep_default 7265 7266 # Add Query update for PZFlags 7267 sql_update_pztags = f""" 7268 UPDATE {table_variants} 7269 SET INFO = concat( 7270 INFO, 7271 CASE WHEN INFO NOT in ('','.') 7272 THEN ';' 7273 ELSE '' 7274 END, 7275 '{pz_prefix}Tags{pzfields_sep}{profile}={pztags_value}' 7276 ) 7277 """ 7278 sql_queries.append(sql_update_pztags) 7279 7280 # Add Query update for PZFlags for default 7281 if profile == default_profile: 7282 sql_update_pztags_default = f""" 7283 UPDATE {table_variants} 7284 SET INFO = concat( 7285 INFO, 7286 ';', 7287 '{pz_prefix}Tags={pztags_value}' 7288 ) 7289 """ 7290 sql_queries.append(sql_update_pztags_default) 7291 7292 log.info(f"""Profile '{profile}' - Prioritization... """) 7293 7294 if sql_queries: 7295 7296 for sql_query in sql_queries: 7297 log.debug( 7298 f"""Profile '{profile}' - Prioritization query: {sql_query}... """ 7299 ) 7300 self.conn.execute(sql_query) 7301 7302 log.info(f"""Profile '{profile}' - Update... 
""") 7303 sql_query_update = f""" 7304 UPDATE {table_variants} 7305 SET INFO = 7306 concat( 7307 CASE 7308 WHEN INFO NOT IN ('','.') 7309 THEN concat(INFO, ';') 7310 ELSE '' 7311 END 7312 {sql_set_info_option} 7313 ) 7314 """ 7315 self.conn.execute(sql_query_update) 7316 7317 else: 7318 7319 log.warning(f"No profiles in parameters") 7320 7321 # Remove added columns 7322 for added_column in added_columns: 7323 self.drop_column(column=added_column) 7324 7325 # Explode INFOS fields into table fields 7326 if self.get_explode_infos(): 7327 self.explode_infos( 7328 prefix=self.get_explode_infos_prefix(), 7329 fields=self.get_explode_infos_fields(), 7330 force=True, 7331 ) 7332 7333 return True 7334 7335 ### 7336 # HGVS 7337 ### 7338 7339 def annotation_hgvs(self, threads: int = None) -> None: 7340 """ 7341 The `annotation_hgvs` function performs HGVS annotation on a set of variants using genomic 7342 coordinates and alleles. 7343 7344 :param threads: The `threads` parameter is an optional integer that specifies the number of 7345 threads to use for parallel processing. If no value is provided, it will default to the number 7346 of threads obtained from the `get_threads()` method 7347 :type threads: int 7348 """ 7349 7350 # Function for each partition of the Dask Dataframe 7351 def partition_function(partition): 7352 """ 7353 The function `partition_function` applies the `annotation_hgvs_partition` function to 7354 each row of a DataFrame called `partition`. 7355 7356 :param partition: The parameter "partition" is a pandas DataFrame that contains the data 7357 to be processed 7358 :return: the result of applying the "annotation_hgvs_partition" function to each row of 7359 the "partition" dataframe along the axis 1. 
7360 """ 7361 return partition.apply(annotation_hgvs_partition, axis=1) 7362 7363 def annotation_hgvs_partition(row) -> str: 7364 """ 7365 The function `annotation_hgvs_partition` takes in a row of data and returns a string 7366 containing a list of HGVS names associated with the given genomic coordinates and alleles. 7367 7368 :param row: A dictionary-like object that contains the values for the following keys: 7369 :return: a string that contains the HGVS names associated with the given row of data. 7370 """ 7371 7372 chr = row["CHROM"] 7373 pos = row["POS"] 7374 ref = row["REF"] 7375 alt = row["ALT"] 7376 7377 # Find list of associated transcripts 7378 transcripts_list = list( 7379 polars_conn.execute( 7380 f""" 7381 SELECT transcript 7382 FROM refseq_df 7383 WHERE CHROM='{chr}' 7384 AND POS={pos} 7385 """ 7386 )["transcript"] 7387 ) 7388 7389 # Full HGVS annotation in list 7390 hgvs_full_list = [] 7391 7392 for transcript_name in transcripts_list: 7393 7394 # Transcript 7395 transcript = get_transcript( 7396 transcripts=transcripts, transcript_name=transcript_name 7397 ) 7398 # Exon 7399 if use_exon: 7400 exon = transcript.find_exon_number(pos) 7401 else: 7402 exon = None 7403 # Protein 7404 transcript_protein = None 7405 if use_protein or add_protein or full_format: 7406 transcripts_protein = list( 7407 polars_conn.execute( 7408 f""" 7409 SELECT protein 7410 FROM refseqlink_df 7411 WHERE transcript='{transcript_name}' 7412 LIMIT 1 7413 """ 7414 )["protein"] 7415 ) 7416 if len(transcripts_protein): 7417 transcript_protein = transcripts_protein[0] 7418 7419 # HGVS name 7420 hgvs_name = format_hgvs_name( 7421 chr, 7422 pos, 7423 ref, 7424 alt, 7425 genome=genome, 7426 transcript=transcript, 7427 transcript_protein=transcript_protein, 7428 exon=exon, 7429 use_gene=use_gene, 7430 use_protein=use_protein, 7431 full_format=full_format, 7432 use_version=use_version, 7433 codon_type=codon_type, 7434 ) 7435 hgvs_full_list.append(hgvs_name) 7436 if add_protein and not 
use_protein and not full_format: 7437 hgvs_name = format_hgvs_name( 7438 chr, 7439 pos, 7440 ref, 7441 alt, 7442 genome=genome, 7443 transcript=transcript, 7444 transcript_protein=transcript_protein, 7445 exon=exon, 7446 use_gene=use_gene, 7447 use_protein=True, 7448 full_format=False, 7449 use_version=use_version, 7450 codon_type=codon_type, 7451 ) 7452 hgvs_full_list.append(hgvs_name) 7453 7454 # Create liste of HGVS annotations 7455 hgvs_full = ",".join(hgvs_full_list) 7456 7457 return hgvs_full 7458 7459 # Polars connexion 7460 polars_conn = pl.SQLContext(register_globals=True, eager=True) 7461 7462 # Config 7463 config = self.get_config() 7464 7465 # Databases 7466 # Genome 7467 databases_genomes_folders = ( 7468 config.get("folders", {}) 7469 .get("databases", {}) 7470 .get("genomes", DEFAULT_GENOME_FOLDER) 7471 ) 7472 databases_genome = ( 7473 config.get("folders", {}).get("databases", {}).get("genomes", "") 7474 ) 7475 # refseq database folder 7476 databases_refseq_folders = ( 7477 config.get("folders", {}) 7478 .get("databases", {}) 7479 .get("refseq", DEFAULT_REFSEQ_FOLDER) 7480 ) 7481 # refseq 7482 databases_refseq = config.get("databases", {}).get("refSeq", None) 7483 # refSeqLink 7484 databases_refseqlink = config.get("databases", {}).get("refSeqLink", None) 7485 7486 # Param 7487 param = self.get_param() 7488 7489 # Quick HGVS 7490 if "hgvs_options" in param and param.get("hgvs_options", ""): 7491 log.info(f"Quick HGVS Annotation:") 7492 if not param.get("hgvs", None): 7493 param["hgvs"] = {} 7494 for option in param.get("hgvs_options", "").split(","): 7495 option_var_val = option.split("=") 7496 option_var = option_var_val[0] 7497 if len(option_var_val) > 1: 7498 option_val = option_var_val[1] 7499 else: 7500 option_val = "True" 7501 if option_val.upper() in ["TRUE"]: 7502 option_val = True 7503 elif option_val.upper() in ["FALSE"]: 7504 option_val = False 7505 log.info(f" {option_var}={option_val}") 7506 param["hgvs"][option_var] = option_val 7507 
7508 # Check if HGVS annotation enabled 7509 if "hgvs" in param: 7510 log.info(f"HGVS Annotation... ") 7511 for hgvs_option in param.get("hgvs", {}): 7512 log.info(f"{hgvs_option}: {param.get('hgvs',{}).get(hgvs_option)}") 7513 else: 7514 return 7515 7516 # HGVS Param 7517 param_hgvs = param.get("hgvs", {}) 7518 use_exon = param_hgvs.get("use_exon", False) 7519 use_gene = param_hgvs.get("use_gene", False) 7520 use_protein = param_hgvs.get("use_protein", False) 7521 add_protein = param_hgvs.get("add_protein", False) 7522 full_format = param_hgvs.get("full_format", False) 7523 use_version = param_hgvs.get("use_version", False) 7524 codon_type = param_hgvs.get("codon_type", "3") 7525 7526 # refSseq refSeqLink 7527 databases_refseq = param_hgvs.get("refseq", databases_refseq) 7528 databases_refseqlink = param_hgvs.get("refseqlink", databases_refseqlink) 7529 7530 # Assembly 7531 assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY)) 7532 7533 # Genome 7534 genome_file = None 7535 if find_genome(databases_genome): 7536 genome_file = find_genome(databases_genome) 7537 else: 7538 genome_file = find_genome( 7539 genome_path=databases_genomes_folders, assembly=assembly 7540 ) 7541 log.debug("Genome: " + str(genome_file)) 7542 7543 # refSseq 7544 refseq_file = find_file_prefix( 7545 input_file=databases_refseq, 7546 prefix="ncbiRefSeq", 7547 folder=databases_refseq_folders, 7548 assembly=assembly, 7549 ) 7550 log.debug("refSeq: " + str(refseq_file)) 7551 7552 # refSeqLink 7553 refseqlink_file = find_file_prefix( 7554 input_file=databases_refseqlink, 7555 prefix="ncbiRefSeqLink", 7556 folder=databases_refseq_folders, 7557 assembly=assembly, 7558 ) 7559 log.debug("refSeqLink: " + str(refseqlink_file)) 7560 7561 # Threads 7562 if not threads: 7563 threads = self.get_threads() 7564 log.debug("Threads: " + str(threads)) 7565 7566 # Variables 7567 table_variants = self.get_table_variants(clause="update") 7568 7569 # Get variants SNV and InDel only 7570 
query_variants = f""" 7571 SELECT "#CHROM" AS CHROM, POS, REF, ALT 7572 FROM {table_variants} 7573 WHERE REF ~ '^[A-Za-z]+$' AND ALT ~ '^[A-Za-z]+$' 7574 """ 7575 df_variants = self.get_query_to_df(query_variants) 7576 7577 # Added columns 7578 added_columns = [] 7579 7580 # Add hgvs column in variants table 7581 hgvs_column_name = "hgvs_" + str(random.randrange(1000)) 7582 added_column = self.add_column( 7583 table_variants, hgvs_column_name, "STRING", default_value=None 7584 ) 7585 added_columns.append(added_column) 7586 7587 log.debug(f"refSeq loading...") 7588 # refSeq in duckDB 7589 refseq_table = get_refseq_table( 7590 conn=self.conn, refseq_table="refseq", refseq_file=refseq_file 7591 ) 7592 # Loading all refSeq in Dataframe 7593 refseq_query = f""" 7594 SELECT df_variants.CHROM, df_variants.POS, {refseq_table}.name AS transcript 7595 FROM {refseq_table} 7596 JOIN df_variants ON ( 7597 {refseq_table}.chrom = df_variants.CHROM 7598 AND {refseq_table}.txStart<=df_variants.POS 7599 AND {refseq_table}.txEnd>=df_variants.POS 7600 ) 7601 """ 7602 refseq_df = self.conn.query(refseq_query).pl() 7603 7604 if refseqlink_file: 7605 log.debug(f"refSeqLink loading...") 7606 # refSeqLink in duckDB 7607 refseqlink_table = get_refseq_table( 7608 conn=self.conn, refseq_table="refseqlink", refseq_file=refseqlink_file 7609 ) 7610 # Loading all refSeqLink in Dataframe 7611 protacc_column = "protAcc_with_ver" 7612 mrnaacc_column = "mrnaAcc_with_ver" 7613 refseqlink_query = f""" 7614 SELECT {refseq_table}.chrom, {protacc_column} AS protein, {mrnaacc_column} AS transcript 7615 FROM {refseqlink_table} 7616 JOIN {refseq_table} ON ({refseq_table}.name = {refseqlink_table}.mrnaAcc_with_ver) 7617 WHERE protAcc_without_ver IS NOT NULL 7618 """ 7619 # Polars Dataframe 7620 refseqlink_df = self.conn.query(f"{refseqlink_query}").pl() 7621 7622 # Read RefSeq transcripts into a python dict/model. 
7623 log.debug(f"Transcripts loading...") 7624 with tempfile.TemporaryDirectory() as tmpdir: 7625 transcripts_query = f""" 7626 COPY ( 7627 SELECT {refseq_table}.* 7628 FROM {refseq_table} 7629 JOIN df_variants ON ( 7630 {refseq_table}.chrom=df_variants.CHROM 7631 AND {refseq_table}.txStart<=df_variants.POS 7632 AND {refseq_table}.txEnd>=df_variants.POS 7633 ) 7634 ) 7635 TO '{tmpdir}/transcript.tsv' (DELIMITER '\t'); 7636 """ 7637 self.conn.query(transcripts_query) 7638 with open(f"{tmpdir}/transcript.tsv") as infile: 7639 transcripts = read_transcripts(infile) 7640 7641 # Polars connexion 7642 polars_conn = pl.SQLContext(register_globals=True, eager=True) 7643 7644 log.debug("Genome loading...") 7645 # Read genome sequence using pyfaidx. 7646 genome = Fasta(genome_file) 7647 7648 log.debug("Start annotation HGVS...") 7649 7650 # Create 7651 # a Dask Dataframe from Pandas dataframe with partition as number of threads 7652 ddf = dd.from_pandas(df_variants, npartitions=threads) 7653 7654 # Use dask.dataframe.apply() to apply function on each partition 7655 ddf[hgvs_column_name] = ddf.map_partitions(partition_function) 7656 7657 # Convert Dask DataFrame to Pandas Dataframe 7658 df = ddf.compute() 7659 7660 # Convert Pandas dataframe to parquet (due to error in cast VARCHAR -> NULL ???) 
7661 with tempfile.TemporaryDirectory() as tmpdir: 7662 df_parquet = os.path.join(tmpdir, "df.parquet") 7663 df.to_parquet(df_parquet) 7664 7665 # Update hgvs column 7666 update_variant_query = f""" 7667 UPDATE {table_variants} 7668 SET "{hgvs_column_name}"=df."{hgvs_column_name}" 7669 FROM read_parquet('{df_parquet}') as df 7670 WHERE variants."#CHROM" = df.CHROM 7671 AND variants.POS = df.POS 7672 AND variants.REF = df.REF 7673 AND variants.ALT = df.ALT 7674 AND df."{hgvs_column_name}" NOT IN ('') AND df."{hgvs_column_name}" NOT NULL 7675 """ 7676 self.execute_query(update_variant_query) 7677 7678 # Update INFO column 7679 sql_query_update = f""" 7680 UPDATE {table_variants} 7681 SET INFO = 7682 concat( 7683 CASE 7684 WHEN INFO NOT IN ('','.') 7685 THEN concat(INFO, ';') 7686 ELSE '' 7687 END, 7688 'hgvs=', 7689 {hgvs_column_name} 7690 ) 7691 WHERE "{hgvs_column_name}" NOT IN ('') AND "{hgvs_column_name}" NOT NULL 7692 """ 7693 self.execute_query(sql_query_update) 7694 7695 # Add header 7696 HGVS_INFOS = { 7697 "hgvs": { 7698 "ID": "hgvs", 7699 "Number": ".", 7700 "Type": "String", 7701 "Description": f"HGVS annotatation with HOWARD", 7702 } 7703 } 7704 7705 for field in HGVS_INFOS: 7706 field_ID = HGVS_INFOS[field]["ID"] 7707 field_description = HGVS_INFOS[field]["Description"] 7708 self.get_header().infos[field_ID] = vcf.parser._Info( 7709 field_ID, 7710 HGVS_INFOS[field]["Number"], 7711 HGVS_INFOS[field]["Type"], 7712 field_description, 7713 "unknown", 7714 "unknown", 7715 code_type_map[HGVS_INFOS[field]["Type"]], 7716 ) 7717 7718 # Remove added columns 7719 for added_column in added_columns: 7720 self.drop_column(column=added_column) 7721 7722 ### 7723 # Calculation 7724 ### 7725 7726 def get_operations_help( 7727 self, operations_config_dict: dict = {}, operations_config_file: str = None 7728 ) -> list: 7729 7730 # Init 7731 operations_help = [] 7732 7733 # operations 7734 operations = self.get_config_json( 7735 name="calculations", 7736 
config_dict=operations_config_dict, 7737 config_file=operations_config_file, 7738 ) 7739 for op in operations: 7740 op_name = operations[op].get("name", op).upper() 7741 op_description = operations[op].get("description", op_name) 7742 op_available = operations[op].get("available", False) 7743 if op_available: 7744 operations_help.append(f" {op_name}: {op_description}") 7745 7746 # Sort operations 7747 operations_help.sort() 7748 7749 # insert header 7750 operations_help.insert(0, "Available calculation operations:") 7751 7752 # Return 7753 return operations_help 7754 7755 def calculation( 7756 self, 7757 operations: dict = {}, 7758 operations_config_dict: dict = {}, 7759 operations_config_file: str = None, 7760 ) -> None: 7761 """ 7762 It takes a list of operations, and for each operation, it checks if it's a python or sql 7763 operation, and then calls the appropriate function 7764 7765 param json example: 7766 "calculation": { 7767 "NOMEN": { 7768 "options": { 7769 "hgvs_field": "hgvs" 7770 }, 7771 "middle" : null 7772 } 7773 """ 7774 7775 # Param 7776 param = self.get_param() 7777 7778 # operations config 7779 operations_config = self.get_config_json( 7780 name="calculations", 7781 config_dict=operations_config_dict, 7782 config_file=operations_config_file, 7783 ) 7784 7785 # Upper keys 7786 operations_config = {k.upper(): v for k, v in operations_config.items()} 7787 7788 # Calculations 7789 7790 # Operations from param 7791 operations = param.get("calculation", {}).get("calculations", operations) 7792 7793 # Quick calculation - add 7794 if param.get("calculations", None): 7795 calculations_list = [ 7796 value for value in param.get("calculations", "").split(",") 7797 ] 7798 log.info(f"Quick Calculations:") 7799 for calculation_key in calculations_list: 7800 log.info(f" {calculation_key}") 7801 for calculation_operation in calculations_list: 7802 if calculation_operation.upper() not in operations: 7803 operations[calculation_operation.upper()] = {} 7804 
add_value_into_dict( 7805 dict_tree=param, 7806 sections=[ 7807 "calculation", 7808 "calculations", 7809 calculation_operation.upper(), 7810 ], 7811 value={}, 7812 ) 7813 7814 # Operations for calculation 7815 if not operations: 7816 operations = param.get("calculation", {}).get("calculations", {}) 7817 7818 if operations: 7819 log.info(f"Calculations...") 7820 7821 # For each operations 7822 for operation_name in operations: 7823 operation_name = operation_name.upper() 7824 if operation_name not in [""]: 7825 if operation_name in operations_config: 7826 log.info(f"Calculation '{operation_name}'") 7827 operation = operations_config[operation_name] 7828 operation_type = operation.get("type", "sql") 7829 if operation_type == "python": 7830 self.calculation_process_function( 7831 operation=operation, operation_name=operation_name 7832 ) 7833 elif operation_type == "sql": 7834 self.calculation_process_sql( 7835 operation=operation, operation_name=operation_name 7836 ) 7837 else: 7838 log.error( 7839 f"Operations config: Type '{operation_type}' NOT available" 7840 ) 7841 raise ValueError( 7842 f"Operations config: Type '{operation_type}' NOT available" 7843 ) 7844 else: 7845 log.error( 7846 f"Operations config: Calculation '{operation_name}' NOT available" 7847 ) 7848 raise ValueError( 7849 f"Operations config: Calculation '{operation_name}' NOT available" 7850 ) 7851 7852 # Explode INFOS fields into table fields 7853 if self.get_explode_infos(): 7854 self.explode_infos( 7855 prefix=self.get_explode_infos_prefix(), 7856 fields=self.get_explode_infos_fields(), 7857 force=True, 7858 ) 7859 7860 def calculation_process_sql( 7861 self, operation: dict, operation_name: str = "unknown" 7862 ) -> None: 7863 """ 7864 The `calculation_process_sql` function takes in a mathematical operation as a string and 7865 performs the operation, updating the specified table with the result. 
7866 7867 :param operation: The `operation` parameter is a dictionary that contains information about the 7868 mathematical operation to be performed. It includes the following keys: 7869 :type operation: dict 7870 :param operation_name: The `operation_name` parameter is a string that represents the name of 7871 the mathematical operation being performed. It is used for logging and error handling purposes, 7872 defaults to unknown 7873 :type operation_name: str (optional) 7874 """ 7875 7876 # table variants 7877 table_variants = self.get_table_variants(clause="alter") 7878 7879 # Operation infos 7880 operation_name = operation.get("name", "unknown") 7881 log.debug(f"process sql {operation_name}") 7882 output_column_name = operation.get("output_column_name", operation_name) 7883 output_column_type = operation.get("output_column_type", "String") 7884 prefix = operation.get("explode_infos_prefix", "") 7885 output_column_type_sql = code_type_map_to_sql.get(output_column_type, "VARCHAR") 7886 output_column_description = operation.get( 7887 "output_column_description", f"{operation_name} operation" 7888 ) 7889 operation_query = operation.get("operation_query", None) 7890 if isinstance(operation_query, list): 7891 operation_query = " ".join(operation_query) 7892 operation_info_fields = operation.get("info_fields", []) 7893 operation_info_fields_check = operation.get("info_fields_check", False) 7894 operation_info = operation.get("operation_info", True) 7895 7896 if operation_query: 7897 7898 # Info fields check 7899 operation_info_fields_check_result = True 7900 if operation_info_fields_check: 7901 header_infos = self.get_header().infos 7902 for info_field in operation_info_fields: 7903 operation_info_fields_check_result = ( 7904 operation_info_fields_check_result 7905 and info_field in header_infos 7906 ) 7907 7908 # If info fields available 7909 if operation_info_fields_check_result: 7910 7911 # Added_columns 7912 added_columns = [] 7913 7914 # Create VCF header field 
7915 vcf_reader = self.get_header() 7916 vcf_reader.infos[output_column_name] = vcf.parser._Info( 7917 output_column_name, 7918 ".", 7919 output_column_type, 7920 output_column_description, 7921 "howard calculation", 7922 "0", 7923 self.code_type_map.get(output_column_type), 7924 ) 7925 7926 # Explode infos if needed 7927 log.debug(f"calculation_process_sql prefix {prefix}") 7928 added_columns += self.explode_infos( 7929 prefix=prefix, 7930 fields=[output_column_name] + operation_info_fields, 7931 force=True, 7932 ) 7933 7934 # Create column 7935 added_column = self.add_column( 7936 table_name=table_variants, 7937 column_name=prefix + output_column_name, 7938 column_type=output_column_type_sql, 7939 default_value="null", 7940 ) 7941 added_columns.append(added_column) 7942 7943 # Operation calculation 7944 try: 7945 7946 # Query to update calculation column 7947 sql_update = f""" 7948 UPDATE {table_variants} 7949 SET "{prefix}{output_column_name}" = ({operation_query}) 7950 """ 7951 self.conn.execute(sql_update) 7952 7953 # Add to INFO 7954 if operation_info: 7955 sql_update_info = f""" 7956 UPDATE {table_variants} 7957 SET "INFO" = 7958 concat( 7959 CASE 7960 WHEN "INFO" IS NOT NULL 7961 THEN concat("INFO", ';') 7962 ELSE '' 7963 END, 7964 '{output_column_name}=', 7965 "{prefix}{output_column_name}" 7966 ) 7967 WHERE "{prefix}{output_column_name}" IS NOT NULL AND "{prefix}{output_column_name}" NOT IN ('') 7968 """ 7969 self.conn.execute(sql_update_info) 7970 7971 except: 7972 log.error( 7973 f"Operations config: Calculation '{operation_name}' query failed" 7974 ) 7975 raise ValueError( 7976 f"Operations config: Calculation '{operation_name}' query failed" 7977 ) 7978 7979 # Remove added columns 7980 for added_column in added_columns: 7981 log.debug(f"added_column: {added_column}") 7982 self.drop_column(column=added_column) 7983 7984 else: 7985 log.error( 7986 f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields 
{operation_info_fields}" 7987 ) 7988 raise ValueError( 7989 f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}" 7990 ) 7991 7992 else: 7993 log.error( 7994 f"Operations config: Calculation '{operation_name}' query NOT defined" 7995 ) 7996 raise ValueError( 7997 f"Operations config: Calculation '{operation_name}' query NOT defined" 7998 ) 7999 8000 def calculation_process_function( 8001 self, operation: dict, operation_name: str = "unknown" 8002 ) -> None: 8003 """ 8004 The `calculation_process_function` takes in an operation dictionary and performs the specified 8005 function with the given parameters. 8006 8007 :param operation: The `operation` parameter is a dictionary that contains information about the 8008 operation to be performed. It has the following keys: 8009 :type operation: dict 8010 :param operation_name: The `operation_name` parameter is a string that represents the name of 8011 the operation being performed. It is used for logging purposes, defaults to unknown 8012 :type operation_name: str (optional) 8013 """ 8014 8015 operation_name = operation["name"] 8016 log.debug(f"process sql {operation_name}") 8017 function_name = operation["function_name"] 8018 function_params = operation["function_params"] 8019 getattr(self, function_name)(*function_params) 8020 8021 def calculation_variant_id(self) -> None: 8022 """ 8023 The function `calculation_variant_id` adds a variant ID annotation to a VCF file header and 8024 updates the INFO field of a variants table with the variant ID. 
8025 """ 8026 8027 # variant_id annotation field 8028 variant_id_tag = self.get_variant_id_column() 8029 added_columns = [variant_id_tag] 8030 8031 # variant_id hgvs tags" 8032 vcf_infos_tags = { 8033 variant_id_tag: "howard variant ID annotation", 8034 } 8035 8036 # Variants table 8037 table_variants = self.get_table_variants() 8038 8039 # Header 8040 vcf_reader = self.get_header() 8041 8042 # Add variant_id to header 8043 vcf_reader.infos[variant_id_tag] = vcf.parser._Info( 8044 variant_id_tag, 8045 ".", 8046 "String", 8047 vcf_infos_tags.get(variant_id_tag, "howard variant ID annotation"), 8048 "howard calculation", 8049 "0", 8050 self.code_type_map.get("String"), 8051 ) 8052 8053 # Update 8054 sql_update = f""" 8055 UPDATE {table_variants} 8056 SET "INFO" = 8057 concat( 8058 CASE 8059 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 8060 THEN '' 8061 ELSE concat("INFO", ';') 8062 END, 8063 '{variant_id_tag}=', 8064 "{variant_id_tag}" 8065 ) 8066 """ 8067 self.conn.execute(sql_update) 8068 8069 # Remove added columns 8070 for added_column in added_columns: 8071 self.drop_column(column=added_column) 8072 8073 def calculation_extract_snpeff_hgvs( 8074 self, 8075 snpeff_hgvs: str = "snpeff_hgvs", 8076 snpeff_field: str = "ANN", 8077 ) -> None: 8078 """ 8079 The function `calculation_extract_snpeff_hgvs` extracts HGVS nomenclatures from the SnpEff 8080 annotation field in a VCF file and adds them as a new column in the variants table. 8081 8082 :param snpeff_hgvs: The `snpeff_hgvs` parameter in the `calculation_extract_snpeff_hgvs` 8083 function is used to specify the name of the column that will store the HGVS nomenclatures 8084 extracted from the SnpEff annotation field in a VCF file. This parameter allows you, defaults to 8085 snpeff_hgvs 8086 :type snpeff_hgvs: str (optional) 8087 :param snpeff_field: The `snpeff_field` parameter in the `calculation_extract_snpeff_hgvs` 8088 function represents the field in the VCF file that contains SnpEff annotations. 
This field is 8089 used to extract HGVS nomenclatures from the SnpEff annotation field and add them as a, defaults 8090 to ANN 8091 :type snpeff_field: str (optional) 8092 """ 8093 8094 # Snpeff hgvs tags 8095 vcf_infos_tags = { 8096 snpeff_hgvs: "HGVS nomenclatures from snpEff annotation", 8097 } 8098 8099 # Prefix 8100 prefix = self.get_explode_infos_prefix() 8101 if prefix: 8102 prefix = "INFO/" 8103 8104 # snpEff fields 8105 speff_ann_infos = prefix + snpeff_field 8106 speff_hgvs_infos = prefix + snpeff_hgvs 8107 8108 # Variants table 8109 table_variants = self.get_table_variants() 8110 8111 # Header 8112 vcf_reader = self.get_header() 8113 8114 # Add columns 8115 added_columns = [] 8116 8117 # Explode HGVS field in column 8118 added_columns += self.explode_infos(fields=[snpeff_field]) 8119 8120 if snpeff_field in vcf_reader.infos: 8121 8122 log.debug(vcf_reader.infos[snpeff_field]) 8123 8124 # Extract ANN header 8125 ann_description = vcf_reader.infos[snpeff_field].desc 8126 pattern = r"'(.+?)'" 8127 match = re.search(pattern, ann_description) 8128 if match: 8129 ann_header_match = match.group(1).split(" | ") 8130 ann_header_desc = {} 8131 for i in range(len(ann_header_match)): 8132 ann_header_info = "".join( 8133 char for char in ann_header_match[i] if char.isalnum() 8134 ) 8135 ann_header_desc[ann_header_info] = ann_header_match[i] 8136 if not ann_header_desc: 8137 raise ValueError("Invalid header description format") 8138 else: 8139 raise ValueError("Invalid header description format") 8140 8141 # Create variant id 8142 variant_id_column = self.get_variant_id_column() 8143 added_columns += [variant_id_column] 8144 8145 # Create dataframe 8146 dataframe_snpeff_hgvs = self.get_query_to_df( 8147 f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """ 8148 ) 8149 8150 # Create main NOMEN column 8151 dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[ 8152 speff_ann_infos 8153 ].apply( 8154 lambda x: extract_snpeff_hgvs( 
    def calculation_snpeff_ann_explode(
        self,
        uniquify: bool = True,
        output_format: str = "fields",
        output_prefix: str = "snpeff_",
        snpeff_field: str = "ANN",
    ) -> None:
        """
        The `calculation_snpeff_ann_explode` function processes SnpEff annotations in a VCF file by
        exploding the annotation field and updating variant information accordingly.

        :param uniquify: The `uniquify` parameter is a boolean flag that determines whether the
        exploded output should be de-duplicated, defaults to True
        :type uniquify: bool (optional)
        :param output_format: The `output_format` parameter specifies the format in which the output
        annotations will be generated. It has a default value of "fields". You can also set it to
        "JSON" to output the annotations in JSON format, defaults to fields
        :type output_format: str (optional)
        :param output_prefix: The `output_prefix` parameter is the prefix added to the INFO tags
        generated during the calculation, to differentiate the new annotations from existing ones,
        defaults to snpeff_
        :type output_prefix: str (optional)
        :param snpeff_field: The `snpeff_field` parameter is the INFO field in the VCF file that
        contains the SnpEff annotations to explode, defaults to ANN
        :type snpeff_field: str (optional)
        """

        # Internal name of the exploded-annotations column
        snpeff_hgvs = "snpeff_ann_explode"

        # Snpeff tags (descriptions for the VCF header)
        vcf_infos_tags = {
            snpeff_hgvs: "Explode snpEff annotations",
        }

        # Prefix for exploded INFO columns
        # NOTE(review): any non-empty prefix is replaced by "INFO/" here —
        # same pattern as calculation_extract_snpeff_hgvs; confirm intended
        prefix = self.get_explode_infos_prefix()
        if prefix:
            prefix = "INFO/"

        # snpEff column names (exploded ANN field, and the new exploded column)
        speff_ann_infos = prefix + snpeff_field
        speff_hgvs_infos = prefix + snpeff_hgvs

        # Variants table
        table_variants = self.get_table_variants()

        # Header
        vcf_reader = self.get_header()

        # Columns created for this calculation, removed again at the end
        added_columns = []

        # Explode the snpEff annotation field into a table column
        added_columns += self.explode_infos(fields=[snpeff_field])
        log.debug(f"snpeff_field={snpeff_field}")
        log.debug(f"added_columns={added_columns}")

        if snpeff_field in vcf_reader.infos:

            # Extract the ANN sub-field names from the header description,
            # which embeds them as "'Allele | Annotation | ...'"
            ann_description = vcf_reader.infos[snpeff_field].desc
            pattern = r"'(.+?)'"
            match = re.search(pattern, ann_description)
            if match:
                ann_header_match = match.group(1).split(" | ")
                ann_header = []
                ann_header_desc = {}
                for i in range(len(ann_header_match)):
                    # Keep only alphanumeric characters to build a clean key
                    ann_header_info = "".join(
                        char for char in ann_header_match[i] if char.isalnum()
                    )
                    ann_header.append(ann_header_info)
                    ann_header_desc[ann_header_info] = ann_header_match[i]
                if not ann_header_desc:
                    raise ValueError("Invalid header description format")
            else:
                raise ValueError("Invalid header description format")

            # Create variant id column (used to join the dataframe back to the table)
            variant_id_column = self.get_variant_id_column()
            added_columns += [variant_id_column]

            # Load variant id + exploded ANN field into a dataframe
            dataframe_snpeff_hgvs = self.get_query_to_df(
                f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """
            )

            # Explode the snpEff annotations row by row
            dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[
                speff_ann_infos
            ].apply(
                lambda x: explode_snpeff_ann(
                    str(x),
                    uniquify=uniquify,
                    output_format=output_format,
                    prefix=output_prefix,
                    header=list(ann_header_desc.values()),
                )
            )

            # Declare the new INFO fields in the header.
            # JSON mode: a single tag named after the prefix holding the whole
            # JSON payload; fields mode: one tag per ANN sub-field.
            # NOTE(review): in JSON mode the header INFO ID is the bare
            # output_prefix (e.g. "snpeff_") — confirm this is the intended tag name.
            ann_annotations_prefix = ""
            if output_format.upper() in ["JSON"]:
                ann_annotations_prefix = f"{output_prefix}="
                vcf_reader.infos[output_prefix] = vcf.parser._Info(
                    output_prefix,
                    ".",
                    "String",
                    vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations")
                    + " - JSON format",
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )
            else:
                for ann_annotation in ann_header:
                    ann_annotation_id = f"{output_prefix}{ann_annotation}"
                    vcf_reader.infos[ann_annotation_id] = vcf.parser._Info(
                        ann_annotation_id,
                        ".",
                        "String",
                        vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations")
                        + f" - '{ann_header_desc[ann_annotation]}' annotation",
                        "howard calculation",
                        "0",
                        self.code_type_map.get("String"),
                    )

            # Fold the computed value into INFO; the local dataframe is
            # referenced by name in the SQL (DuckDB resolves it from the
            # Python scope). In fields mode the exploded value is assumed to
            # already carry its own "tag=value" formatting — TODO confirm
            # against explode_snpeff_ann.
            # NOTE(review): UPDATE targets the literal table `variants` while
            # the WHERE clause uses {table_variants} — confirm both always
            # refer to the same table.
            sql_update = f"""
                UPDATE variants
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        CASE
                            WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN')
                            AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL
                            THEN concat(
                                '{ann_annotations_prefix}',
                                dataframe_snpeff_hgvs."{speff_hgvs_infos}"
                            )
                            ELSE ''
                        END
                    )
                FROM dataframe_snpeff_hgvs
                WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}"

            """
            self.conn.execute(sql_update)

            # Release the dataframe memory
            del dataframe_snpeff_hgvs
            gc.collect()

        else:

            log.warning(
                "No snpEff annotation. Please Anotate with snpEff before use this calculation option"
            )

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)
    def calculation_extract_nomen(self) -> None:
        """
        This function extracts the HGVS nomenclature from the calculation/identification of NOMEN.

        Reads the HGVS annotation field (configured under
        calculation.calculations.NOMEN.options), derives the NOMEN components
        for each variant and appends them as new INFO tags.
        """

        # Key of the intermediate column holding the full NOMEN dict per variant
        field_nomen_dict = "NOMEN_DICT"

        # NOMEN structure: output tag -> header description
        nomen_dict = {
            "NOMEN": "NOMEN hgvs nomenclature considered as reference hgvs (official transcript, first otherwise)",
            "CNOMEN": "CNOMEN hgvs nomenclature at DNA level related to a transcript (TNOMEN)",
            "RNOMEN": "RNOMEN hgvs nomenclature at RNA level related to a transcript (TNOMEN)",
            "NNOMEN": "NNOMEN hgvs nomenclature for non-coding variant",
            "PNOMEN": "PNOMEN hgvs nomenclature at Protein level related to a transcript (TNOMEN)",
            "TVNOMEN": "TVNOMEN hgvs transcript with version (if any) used (e.g. for CNOMEN and PNOMEN)",
            "TNOMEN": "TNOMEN hgvs transcript used (e.g. for CNOMEN and PNOMEN)",
            "VNOMEN": "VNOMEN hgvs transcript version used (e.g. for CNOMEN and PNOMEN)",
            "ENOMEN": "ENOMEN hgvs exon nomenclature related to a transcript (TNOMEN)",
            "GNOMEN": "GNOMEN hgvs gene nomenclature related to a transcript (TNOMEN)",
        }

        # Param
        param = self.get_param()

        # Prefix for exploded INFO columns
        prefix = self.get_explode_infos_prefix()

        # Header
        vcf_reader = self.get_header()

        # Name of the INFO field holding the HGVS annotations (default "hgvs")
        hgvs_field = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("hgvs_field", "hgvs")
        )

        # Optional file listing preferred transcripts (first column used)
        transcripts_file = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("transcripts", None)
        )
        transcripts_file = full_path(transcripts_file)
        transcripts = []
        if transcripts_file:
            if os.path.exists(transcripts_file):
                transcripts_dataframe = transcripts_file_to_df(transcripts_file)
                transcripts = transcripts_dataframe.iloc[:, 0].tolist()
            else:
                log.error(f"Transcript file '{transcripts_file}' does NOT exist")
                raise ValueError(f"Transcript file '{transcripts_file}' does NOT exist")

        # Columns created for this calculation, removed again at the end
        added_columns = []

        # Explode the HGVS field into a table column
        added_columns += self.explode_infos(fields=[hgvs_field])

        # Only proceed when the exploded HGVS column actually exists
        extra_infos = self.get_extra_infos()
        extra_field = prefix + hgvs_field

        if extra_field in extra_infos:

            # Load variant coordinates + HGVS annotations into a dataframe
            dataframe_hgvs = self.get_query_to_df(
                f""" SELECT "#CHROM", "POS", "REF", "ALT", "{extra_field}" FROM variants """
            )

            # Compute the NOMEN dict per variant from the HGVS annotations
            dataframe_hgvs[field_nomen_dict] = dataframe_hgvs[extra_field].apply(
                lambda x: find_nomen(str(x), transcripts=transcripts)
            )

            # Explode the NOMEN structure into one column per tag and build the
            # SQL fragments for the INFO update
            sql_nomen_fields = []
            for nomen_field in nomen_dict:

                # One column per NOMEN component
                # (the lambda's use of nomen_field is safe despite late binding:
                # apply() runs within the current loop iteration)
                dataframe_hgvs[nomen_field] = dataframe_hgvs[field_nomen_dict].apply(
                    lambda x: dict(x).get(nomen_field, "")
                )

                # Declare the tag in the VCF header
                vcf_reader.infos[nomen_field] = vcf.parser._Info(
                    nomen_field,
                    ".",
                    "String",
                    nomen_dict.get(nomen_field, "howard calculation NOMEN"),
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )
                # SQL fragment appending ';<tag>=<value>' only for non-empty values
                sql_nomen_fields.append(
                    f"""
                        CASE
                            WHEN dataframe_hgvs."{nomen_field}" NOT NULL AND dataframe_hgvs."{nomen_field}" NOT IN ('')
                            THEN concat(
                                ';{nomen_field}=',
                                dataframe_hgvs."{nomen_field}"
                            )
                            ELSE ''
                        END
                    """
                )

            # SQL set for update
            sql_nomen_fields_set = ", ".join(sql_nomen_fields)

            # Fold all NOMEN tags into INFO; the local dataframe is referenced
            # by name in the SQL (DuckDB resolves it from the Python scope)
            sql_update = f"""
                UPDATE variants
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL
                            THEN ''
                            ELSE "INFO"
                        END,
                        {sql_nomen_fields_set}
                    )
                FROM dataframe_hgvs
                WHERE variants."#CHROM" = dataframe_hgvs."#CHROM"
                    AND variants."POS" = dataframe_hgvs."POS"
                    AND variants."REF" = dataframe_hgvs."REF"
                    AND variants."ALT" = dataframe_hgvs."ALT"
            """
            self.conn.execute(sql_update)

            # Release the dataframe memory
            del dataframe_hgvs
            gc.collect()

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)
    def calculation_find_by_pipeline(self, tag: str = "findbypipeline") -> None:
        """
        The function `calculation_find_by_pipeline` performs a calculation to find the number of
        pipeline/sample for a variant and updates the variant information in a VCF file.

        :param tag: The `tag` parameter is a string that represents the annotation field for the
        "findbypipeline" information in the VCF file. It is used to create the annotation field in the
        VCF header and to update the corresponding field in the variants table, defaults to
        findbypipeline
        :type tag: str (optional)
        """

        # Genotype data required: do nothing unless the VCF has a FORMAT column and samples
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # findbypipeline annotation field
            findbypipeline_tag = tag

            # VCF infos tags (descriptions for the VCF header)
            vcf_infos_tags = {
                findbypipeline_tag: f"Number of pipeline/sample for a variant ({findbypipeline_tag})",
            }

            # Prefix for exploded INFO columns
            prefix = self.get_explode_infos_prefix()

            # Column name holding the computed value
            findbypipeline_infos = prefix + findbypipeline_tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Create variant id column (used to join the dataframe back to the table)
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # Columns to select: variant_id, FORMAT and all sample columns
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                self.get_header_sample_list()
            )

            # Load genotype data into a dataframe
            dataframe_findbypipeline = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Compute the findbypipeline value row by row from the genotype columns
            dataframe_findbypipeline[findbypipeline_infos] = (
                dataframe_findbypipeline.apply(
                    lambda row: findbypipeline(
                        row, samples=self.get_header_sample_list()
                    ),
                    axis=1,
                )
            )

            # Add findbypipeline tag to header
            vcf_reader.infos[findbypipeline_tag] = vcf.parser._Info(
                findbypipeline_tag,
                ".",
                "String",
                vcf_infos_tags.get(findbypipeline_tag, "Find in pipeline/sample"),
                "howard calculation",
                "0",
                self.code_type_map.get("String"),
            )

            # Fold the computed value into INFO; the local dataframe is
            # referenced by name in the SQL (DuckDB resolves it from the
            # Python scope).
            # NOTE(review): UPDATE targets the literal table `variants` while the
            # join uses the dataframe — confirm `table_variants` is always "variants".
            sql_update = f"""
                UPDATE variants
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        CASE
                            WHEN dataframe_findbypipeline."{findbypipeline_infos}" NOT IN ('','.')
                            AND dataframe_findbypipeline."{findbypipeline_infos}" NOT NULL
                            THEN concat(
                                '{findbypipeline_tag}=',
                                dataframe_findbypipeline."{findbypipeline_infos}"
                            )
                            ELSE ''
                        END
                    )
                FROM dataframe_findbypipeline
                WHERE variants."{variant_id_column}" = dataframe_findbypipeline."{variant_id_column}"
            """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Release the dataframe memory
            del dataframe_findbypipeline
            gc.collect()
concat("INFO", ';') 8610 END, 8611 CASE 8612 WHEN dataframe_findbypipeline."{findbypipeline_infos}" NOT IN ('','.') 8613 AND dataframe_findbypipeline."{findbypipeline_infos}" NOT NULL 8614 THEN concat( 8615 '{findbypipeline_tag}=', 8616 dataframe_findbypipeline."{findbypipeline_infos}" 8617 ) 8618 ELSE '' 8619 END 8620 ) 8621 FROM dataframe_findbypipeline 8622 WHERE variants."{variant_id_column}" = dataframe_findbypipeline."{variant_id_column}" 8623 """ 8624 self.conn.execute(sql_update) 8625 8626 # Remove added columns 8627 for added_column in added_columns: 8628 self.drop_column(column=added_column) 8629 8630 # Delete dataframe 8631 del dataframe_findbypipeline 8632 gc.collect() 8633 8634 def calculation_genotype_concordance(self) -> None: 8635 """ 8636 The function `calculation_genotype_concordance` calculates the genotype concordance for 8637 multi-caller VCF files and updates the variant information in the database. 8638 """ 8639 8640 # if FORMAT and samples 8641 if ( 8642 "FORMAT" in self.get_header_columns_as_list() 8643 and self.get_header_sample_list() 8644 ): 8645 8646 # genotypeconcordance annotation field 8647 genotypeconcordance_tag = "genotypeconcordance" 8648 8649 # VCF infos tags 8650 vcf_infos_tags = { 8651 genotypeconcordance_tag: "Concordance of genotype for multi caller VCF", 8652 } 8653 8654 # Prefix 8655 prefix = self.get_explode_infos_prefix() 8656 8657 # Field 8658 genotypeconcordance_infos = prefix + genotypeconcordance_tag 8659 8660 # Variants table 8661 table_variants = self.get_table_variants() 8662 8663 # Header 8664 vcf_reader = self.get_header() 8665 8666 # Create variant id 8667 variant_id_column = self.get_variant_id_column() 8668 added_columns = [variant_id_column] 8669 8670 # variant_id, FORMAT and samples 8671 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 8672 self.get_header_sample_list() 8673 ) 8674 8675 # Create dataframe 8676 dataframe_genotypeconcordance = self.get_query_to_df( 8677 f""" SELECT 
                {samples_fields} FROM {table_variants} """
            )

            # Create genotypeconcordance column
            dataframe_genotypeconcordance[genotypeconcordance_infos] = (
                dataframe_genotypeconcordance.apply(
                    lambda row: genotypeconcordance(
                        row, samples=self.get_header_sample_list()
                    ),
                    axis=1,
                )
            )

            # Add genotypeconcordance to header
            # NOTE(review): the fallback description "snpEff hgvs annotations" looks copy-pasted;
            # it is unused here since the key is present in vcf_infos_tags
            vcf_reader.infos[genotypeconcordance_tag] = vcf.parser._Info(
                genotypeconcordance_tag,
                ".",
                "String",
                vcf_infos_tags.get(genotypeconcordance_tag, "snpEff hgvs annotations"),
                "howard calculation",
                "0",
                self.code_type_map.get("String"),
            )

            # Update
            sql_update = f"""
                UPDATE variants
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        CASE
                            WHEN dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT IN ('','.')
                                AND dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT NULL
                            THEN concat(
                                '{genotypeconcordance_tag}=',
                                dataframe_genotypeconcordance."{genotypeconcordance_infos}"
                            )
                            ELSE ''
                        END
                    )
                FROM dataframe_genotypeconcordance
                WHERE variants."{variant_id_column}" = dataframe_genotypeconcordance."{variant_id_column}"
                """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Delete dataframe
            del dataframe_genotypeconcordance
            gc.collect()

    def calculation_barcode(self, tag: str = "barcode") -> None:
        """
        The `calculation_barcode` function calculates barcode values for variants in a VCF file and
        updates the INFO field in the file with the calculated barcode values.

        :param tag: The `tag` parameter in the `calculation_barcode` function is used to specify the tag
        name that will be used for the barcode calculation in the VCF file. If no tag name is provided,
        the default tag name is set to "barcode", defaults to barcode
        :type tag: str (optional)
        """

        # Only applicable when the VCF carries genotypes (FORMAT column and at least one sample)
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # barcode annotation field
            if not tag:
                tag = "barcode"

            # VCF infos tags
            vcf_infos_tags = {
                tag: "barcode calculation (VaRank)",
            }

            # Prefix
            prefix = self.get_explode_infos_prefix()

            # Field
            barcode_infos = prefix + tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Create variant id
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                self.get_header_sample_list()
            )

            # Create dataframe
            dataframe_barcode = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Create barcode column
            dataframe_barcode[barcode_infos] = dataframe_barcode.apply(
                lambda row: barcode(row, samples=self.get_header_sample_list()), axis=1
            )

            # Add barcode to header
            # NOTE(review): the default of this .get() is the same lookup, so it is redundant
            vcf_reader.infos[tag] = vcf.parser._Info(
                tag,
                ".",
                "String",
                vcf_infos_tags.get(tag, vcf_infos_tags.get(tag)),
                "howard calculation",
                "0",
                self.code_type_map.get("String"),
            )

            # Update
            sql_update = f"""
                UPDATE {table_variants}
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        CASE
                            WHEN dataframe_barcode."{barcode_infos}" NOT IN ('','.')
                                AND dataframe_barcode."{barcode_infos}" NOT NULL
                            THEN concat(
                                '{tag}=',
                                dataframe_barcode."{barcode_infos}"
                            )
                            ELSE ''
                        END
                    )
                FROM dataframe_barcode
                WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}"
                """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Delete dataframe
            del dataframe_barcode
            gc.collect()

    def calculation_barcode_family(self, tag: str = "BCF") -> None:
        """
        The `calculation_barcode_family` function calculates barcode values for variants in a VCF file
        and updates the INFO field in the file with the calculated barcode values.

        :param tag: The `tag` parameter in the `calculation_barcode_family` function is used to specify
        the barcode tag that will be added to the VCF file during the calculation process. If no value
        is provided for the `tag` parameter, the default value used is "BCF", defaults to BCF
        :type tag: str (optional)
        """

        # Only applicable when the VCF carries genotypes (FORMAT column and at least one sample)
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # barcode annotation field
            if not tag:
                tag = "BCF"

            # VCF infos tags
            vcf_infos_tags = {
                tag: "barcode family calculation",
                f"{tag}S": "barcode family samples",
            }

            # Param
            param = self.get_param()
            log.debug(f"param={param}")

            # Prefix
            prefix = self.get_explode_infos_prefix()

            # PED param
            ped = (
                param.get("calculation", {})
                .get("calculations", {})
                .get("BARCODEFAMILY", {})
                .get("family_pedigree", None)
            )
            log.debug(f"ped={ped}")

            # Load PED
            if ped:

                # Pedigree is a file
                if isinstance(ped, str) and os.path.exists(full_path(ped)):
                    log.debug("Pedigree is file")
                    with open(full_path(ped)) as ped:
                        ped = json.load(ped)

                # Pedigree is a string
                elif isinstance(ped, str):
                    log.debug("Pedigree is str")
                    try:
                        ped = json.loads(ped)
                        log.debug("Pedigree is json str")
                    except ValueError as e:
                        # Not JSON: treat as a comma-separated list of sample names
                        ped_samples = ped.split(",")
                        ped = {}
                        for ped_sample in ped_samples:
                            ped[ped_sample] = ped_sample

                # Pedigree is a dict
                elif isinstance(ped, dict):
                    log.debug("Pedigree is dict")

                # Pedigree is not well formatted
                else:
                    msg_error = "Pedigree not well formatted"
                    log.error(msg_error)
                    raise ValueError(msg_error)

                # Construct list
                ped_samples = list(ped.values())

            else:
                # No pedigree provided: every sample of the VCF is its own family member
                log.debug("Pedigree not defined. Take all samples")
                ped_samples = self.get_header_sample_list()
                ped = {}
                for ped_sample in ped_samples:
                    ped[ped_sample] = ped_sample

            # Check pedigree
            if not ped or len(ped) == 0:
                msg_error = f"Error in pedigree: samples {ped_samples}"
                log.error(msg_error)
                raise ValueError(msg_error)

            # Log
            log.info(
                "Calculation 'BARCODEFAMILY' - Samples: "
                + ", ".join([f"{member}='{ped[member]}'" for member in ped])
            )
            log.debug(f"ped_samples={ped_samples}")

            # Field
            barcode_infos = prefix + tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Create variant id
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                ped_samples
            )

            # Create dataframe
            dataframe_barcode = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Create barcode column
            dataframe_barcode[barcode_infos] = dataframe_barcode.apply(
                lambda row: barcode(row, samples=ped_samples), axis=1
            )

            # Add barcode family tags (value and samples list) to header
            vcf_reader.formats[tag] = vcf.parser._Format(
                id=tag,
                num=".",
                type="String",
                desc=vcf_infos_tags.get(tag, "barcode family calculation"),
                type_code=self.code_type_map.get("String"),
            )
            vcf_reader.formats[f"{tag}S"] = vcf.parser._Format(
                id=f"{tag}S",
                num=".",
                type="String",
                desc=vcf_infos_tags.get(f"{tag}S", "barcode family samples"),
                type_code=self.code_type_map.get("String"),
            )

            # Update
            # for sample in ped_samples:
            sql_update_set = []
            for sample in self.get_header_sample_list() + ["FORMAT"]:
                if sample in ped_samples:
                    # Sample in the family: append the computed barcode and the family sample list
                    value = f'dataframe_barcode."{barcode_infos}"'
                    value_samples = "'" + ",".join(ped_samples) + "'"
                elif sample == "FORMAT":
                    # FORMAT column: append the new tag names
                    value = f"'{tag}'"
                    value_samples = f"'{tag}S'"
                else:
                    # Sample outside the family: append missing values
                    value = "'.'"
                    value_samples = "'.'"
                format_regex = r"[a-zA-Z0-9\s]"
                sql_update_set.append(
                    f"""
                    "{sample}" =
                        concat(
                            CASE
                                WHEN {table_variants}."{sample}" = './.'
                                THEN concat('./.',regexp_replace(regexp_replace({table_variants}.FORMAT, '{format_regex}', '', 'g'), ':', ':.', 'g'))
                                ELSE {table_variants}."{sample}"
                            END,
                            ':',
                            {value},
                            ':',
                            {value_samples}
                        )
                    """
                )

            sql_update_set_join = ", ".join(sql_update_set)
            sql_update = f"""
                UPDATE {table_variants}
                SET {sql_update_set_join}
                FROM dataframe_barcode
                WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}"
                """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Delete dataframe
            del dataframe_barcode
            gc.collect()

    def calculation_trio(self) -> None:
        """
        The `calculation_trio` function performs trio calculations on a VCF file by adding trio
        information to the INFO field of each variant.
        """

        # Only applicable when the VCF carries genotypes (FORMAT column and at least one sample)
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # trio annotation field
            trio_tag = "trio"

            # VCF infos tags
            vcf_infos_tags = {
                "trio": "trio calculation",
            }

            # Param
            param = self.get_param()

            # Prefix
            prefix = self.get_explode_infos_prefix()

            # Trio param
            trio_ped = (
                param.get("calculation", {})
                .get("calculations", {})
                .get("TRIO", {})
                .get("trio_pedigree", None)
            )

            # Load trio
            if trio_ped:

                # Trio pedigree is a file
                if isinstance(trio_ped, str) and os.path.exists(full_path(trio_ped)):
                    log.debug("TRIO pedigree is file")
                    with open(full_path(trio_ped)) as trio_ped:
                        trio_ped = json.load(trio_ped)

                # Trio pedigree is a string
                elif isinstance(trio_ped, str):
                    log.debug("TRIO pedigree is str")
                    try:
                        trio_ped = json.loads(trio_ped)
                        log.debug("TRIO pedigree is json str")
                    except ValueError as e:
                        # Not JSON: expect a 'father,mother,child' comma-separated list
                        trio_samples = trio_ped.split(",")
                        if len(trio_samples) == 3:
                            trio_ped = {
                                "father": trio_samples[0],
                                "mother": trio_samples[1],
                                "child": trio_samples[2],
                            }
                            log.debug("TRIO pedigree is list str")
                        else:
                            msg_error = "TRIO pedigree not well formatted"
                            log.error(msg_error)
                            raise ValueError(msg_error)

                # Trio pedigree is a dict
                elif isinstance(trio_ped, dict):
                    log.debug("TRIO pedigree is dict")

                # Trio pedigree is not well formatted
                else:
                    msg_error = "TRIO pedigree not well formatted"
                    log.error(msg_error)
                    raise ValueError(msg_error)

                # Construct trio list
                trio_samples = [
                    trio_ped.get("father", ""),
                    trio_ped.get("mother", ""),
                    trio_ped.get("child", ""),
                ]

            else:
                log.debug("TRIO pedigree not defined. Take the first 3 samples")
                samples_list = self.get_header_sample_list()
                if len(samples_list) >= 3:
                    trio_samples = self.get_header_sample_list()[0:3]
                    trio_ped = {
                        "father": trio_samples[0],
                        "mother": trio_samples[1],
                        "child": trio_samples[2],
                    }
                else:
                    msg_error = f"Error in TRIO pedigree: only {len(samples_list)} samples {samples_list}"
                    log.error(msg_error)
                    raise ValueError(msg_error)

            # Check trio pedigree
            if not trio_ped or len(trio_ped) != 3:
                msg_error = f"Error in TRIO pedigree: {trio_ped}"
                log.error(msg_error)
                raise ValueError(msg_error)

            # Log
            log.info(
                f"Calculation 'TRIO' - Samples: "
                + ", ".join([f"{member}='{trio_ped[member]}'" for member in trio_ped])
            )

            # Field
            trio_infos = prefix + trio_tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Create variant id
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                self.get_header_sample_list()
            )

            # Create dataframe
            dataframe_trio = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Create trio column
            dataframe_trio[trio_infos] = dataframe_trio.apply(
                lambda row: trio(row, samples=trio_samples), axis=1
            )

            # Add trio to header
            # NOTE(review): fallback description "snpEff hgvs annotations" looks copy-pasted;
            # it is unused here since the key is present in vcf_infos_tags
            vcf_reader.infos[trio_tag] = vcf.parser._Info(
                trio_tag,
                ".",
                "String",
                vcf_infos_tags.get(trio_tag, "snpEff hgvs annotations"),
                "howard calculation",
                "0",
                self.code_type_map.get("String"),
            )

            # Update
            sql_update = f"""
                UPDATE {table_variants}
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE
concat("INFO", ';') 9180 END, 9181 CASE 9182 WHEN dataframe_trio."{trio_infos}" NOT IN ('','.') 9183 AND dataframe_trio."{trio_infos}" NOT NULL 9184 THEN concat( 9185 '{trio_tag}=', 9186 dataframe_trio."{trio_infos}" 9187 ) 9188 ELSE '' 9189 END 9190 ) 9191 FROM dataframe_trio 9192 WHERE {table_variants}."{variant_id_column}" = dataframe_trio."{variant_id_column}" 9193 """ 9194 self.conn.execute(sql_update) 9195 9196 # Remove added columns 9197 for added_column in added_columns: 9198 self.drop_column(column=added_column) 9199 9200 # Delete dataframe 9201 del dataframe_trio 9202 gc.collect() 9203 9204 def calculation_vaf_normalization(self) -> None: 9205 """ 9206 The `calculation_vaf_normalization` function calculates the VAF (Variant Allele Frequency) 9207 normalization for each sample in a VCF file and updates the FORMAT and INFO fields accordingly. 9208 :return: The function does not return anything. 9209 """ 9210 9211 # if FORMAT and samples 9212 if ( 9213 "FORMAT" in self.get_header_columns_as_list() 9214 and self.get_header_sample_list() 9215 ): 9216 9217 # vaf_normalization annotation field 9218 vaf_normalization_tag = "VAF" 9219 9220 # VCF infos tags 9221 vcf_infos_tags = { 9222 "VAF": "VAF Variant Frequency", 9223 } 9224 9225 # Prefix 9226 prefix = self.get_explode_infos_prefix() 9227 9228 # Variants table 9229 table_variants = self.get_table_variants() 9230 9231 # Header 9232 vcf_reader = self.get_header() 9233 9234 # Do not calculate if VAF already exists 9235 if "VAF" in vcf_reader.formats: 9236 log.debug("VAF already on genotypes") 9237 return 9238 9239 # Create variant id 9240 variant_id_column = self.get_variant_id_column() 9241 added_columns = [variant_id_column] 9242 9243 # variant_id, FORMAT and samples 9244 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 9245 f""" "{sample}" """ for sample in self.get_header_sample_list() 9246 ) 9247 9248 # Create dataframe 9249 query = f""" SELECT {variant_id_column}, FORMAT, {samples_fields} 
FROM {table_variants} """ 9250 log.debug(f"query={query}") 9251 dataframe_vaf_normalization = self.get_query_to_df(query=query) 9252 9253 vaf_normalization_set = [] 9254 9255 # for each sample vaf_normalization 9256 for sample in self.get_header_sample_list(): 9257 dataframe_vaf_normalization[sample] = dataframe_vaf_normalization.apply( 9258 lambda row: vaf_normalization(row, sample=sample), axis=1 9259 ) 9260 vaf_normalization_set.append( 9261 f""" "{sample}" = dataframe_vaf_normalization."{sample}" """ 9262 ) 9263 9264 # Add VAF to FORMAT 9265 dataframe_vaf_normalization["FORMAT"] = dataframe_vaf_normalization[ 9266 "FORMAT" 9267 ].apply(lambda x: str(x) + ":VAF") 9268 vaf_normalization_set.append( 9269 f""" "FORMAT" = dataframe_vaf_normalization."FORMAT" """ 9270 ) 9271 9272 # Add vaf_normalization to header 9273 vcf_reader.formats[vaf_normalization_tag] = vcf.parser._Format( 9274 id=vaf_normalization_tag, 9275 num="1", 9276 type="Float", 9277 desc=vcf_infos_tags.get(vaf_normalization_tag, "VAF Variant Frequency"), 9278 type_code=self.code_type_map.get("Float"), 9279 ) 9280 9281 # Create fields to add in INFO 9282 sql_vaf_normalization_set = " , ".join(vaf_normalization_set) 9283 9284 # Update 9285 sql_update = f""" 9286 UPDATE {table_variants} 9287 SET {sql_vaf_normalization_set} 9288 FROM dataframe_vaf_normalization 9289 WHERE variants."{variant_id_column}" = dataframe_vaf_normalization."{variant_id_column}" 9290 9291 """ 9292 self.conn.execute(sql_update) 9293 9294 # Remove added columns 9295 for added_column in added_columns: 9296 self.drop_column(column=added_column) 9297 9298 # Delete dataframe 9299 del dataframe_vaf_normalization 9300 gc.collect() 9301 9302 def calculation_genotype_stats(self, info: str = "VAF") -> None: 9303 """ 9304 The `calculation_genotype_stats` function calculates genotype statistics for a given information 9305 field in a VCF file and updates the INFO column of the variants table with the calculated 9306 statistics. 

        :param info: The `info` parameter is a string that represents the type of information for which
        genotype statistics are calculated. It is used to generate various VCF info tags for the
        statistics, such as the number of occurrences, the list of values, the minimum value, the
        maximum value, the mean, the median, defaults to VAF
        :type info: str (optional)
        """

        # Only applicable when the VCF carries genotypes (FORMAT column and at least one sample)
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # vaf_stats annotation field
            vaf_stats_tag = info + "_stats"

            # VCF infos tags
            vcf_infos_tags = {
                info + "_stats_nb": f"genotype {info} Statistics - number of {info}",
                info + "_stats_list": f"genotype {info} Statistics - list of {info}",
                info + "_stats_min": f"genotype {info} Statistics - min {info}",
                info + "_stats_max": f"genotype {info} Statistics - max {info}",
                info + "_stats_mean": f"genotype {info} Statistics - mean {info}",
                info + "_stats_mediane": f"genotype {info} Statistics - mediane {info}",
                info
                + "_stats_stdev": f"genotype {info} Statistics - standard deviation {info}",
            }

            # Prefix
            prefix = self.get_explode_infos_prefix()

            # Field
            vaf_stats_infos = prefix + vaf_stats_tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Create variant id
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                self.get_header_sample_list()
            )

            # Create dataframe
            dataframe_vaf_stats = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Create vaf_stats column
            dataframe_vaf_stats[vaf_stats_infos] = dataframe_vaf_stats.apply(
                lambda row: genotype_stats(
                    row, samples=self.get_header_sample_list(), info=info
                ),
                axis=1,
            )

            # List of vcf tags
            sql_vaf_stats_fields = []

            # Check all VAF stats infos
            for stat in vcf_infos_tags:

                # Extract stats
                dataframe_vaf_stats[stat] = dataframe_vaf_stats[vaf_stats_infos].apply(
                    lambda x: dict(x).get(stat, "")
                )

                # Add stat tag to header
                vcf_reader.infos[stat] = vcf.parser._Info(
                    stat,
                    ".",
                    "String",
                    vcf_infos_tags.get(stat, "genotype statistics"),
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )

                # Only the first fragment is emitted without a leading ';'
                if len(sql_vaf_stats_fields):
                    sep = ";"
                else:
                    sep = ""

                # Create fields to add in INFO
                # NOTE(review): if the first stat is NULL its fragment is '' while the next
                # fragment still starts with ';' — this can produce a double ';' in INFO; verify
                sql_vaf_stats_fields.append(
                    f"""
                    CASE
                        WHEN dataframe_vaf_stats."{stat}" NOT NULL
                        THEN concat(
                            '{sep}{stat}=',
                            dataframe_vaf_stats."{stat}"
                        )
                        ELSE ''
                    END
                    """
                )

            # SQL set for update
            sql_vaf_stats_fields_set = ", ".join(sql_vaf_stats_fields)

            # Update
            sql_update = f"""
                UPDATE {table_variants}
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        {sql_vaf_stats_fields_set}
                    )
                FROM dataframe_vaf_stats
                WHERE {table_variants}."{variant_id_column}" = dataframe_vaf_stats."{variant_id_column}"
                """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Delete dataframe
            del dataframe_vaf_stats
            gc.collect()

    def calculation_transcripts_annotation(
        self, info_json: str = None, info_format: str = None
    ) -> None:
        """
        The `calculation_transcripts_annotation` function creates a transcripts table and adds an info
        field to it if transcripts are available.

        :param info_json: The `info_json` parameter in the `calculation_transcripts_annotation` method
        is a string parameter that represents the information field to be used in the transcripts JSON.
        It is used to specify the JSON format for the transcripts information
        :type info_json: str
        :param info_format: The `info_format` parameter in the `calculation_transcripts_annotation`
        method is a string parameter that specifies the format of the information field to be used in
        the transcripts JSON. It is used to define the format of the information field
        :type info_format: str
        """

        # Create transcripts table
        transcripts_table = self.create_transcript_view()

        # Add info field
        if transcripts_table:
            self.transcript_view_to_variants(
                transcripts_table=transcripts_table,
                transcripts_info_field_json=info_json,
                transcripts_info_field_format=info_format,
            )
        else:
            log.info("No Transcripts to process. Check param.json file configuration")

    def calculation_transcripts_prioritization(self) -> None:
        """
        The function `calculation_transcripts_prioritization` creates a transcripts table and
        prioritizes transcripts based on certain criteria.
        """

        # Create transcripts table
        transcripts_table = self.create_transcript_view()

        # Prioritize transcripts if any are available
        if transcripts_table:
            self.transcripts_prioritization(transcripts_table=transcripts_table)
        else:
            log.info("No Transcripts to process. Check param.json file configuration")

    ###############
    # Transcripts #
    ###############

    def transcripts_prioritization(
        self, transcripts_table: str = None, param: dict = {}
    ) -> bool:
        """
        The `transcripts_prioritization` function prioritizes transcripts based on certain parameters
        and updates the variants table with the prioritized information.

        :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name
        of the table containing transcripts data. If no value is provided, it defaults to "transcripts".
        This parameter is used to identify the table where the transcripts data is stored for the
        prioritization process
        :type transcripts_table: str
        :param param: The `param` parameter in the `transcripts_prioritization` method is a dictionary
        that contains various configuration settings for the prioritization process of transcripts. It
        is used to customize the behavior of the prioritization algorithm and includes settings such as
        the prefix for prioritization fields, default profiles, and other
        :type param: dict
        :return: The function `transcripts_prioritization` returns a boolean value `True` if the
        transcripts prioritization process is successfully completed, and `False` if there are any
        issues or if no profile is defined for transcripts prioritization.
        """

        log.debug("Start transcripts prioritization...")

        # Param
        if not param:
            param = self.get_param()

        # Variants table
        table_variants = self.get_table_variants()
        log.debug(f"transcripts_table={transcripts_table}")

        # Transcripts table: create it on demand when not provided
        if transcripts_table is None:
            log.debug(f"transcripts_table={transcripts_table}")
            transcripts_table = self.create_transcript_view(
                transcripts_table="transcripts", param=param
            )
            log.debug(f"transcripts_table={transcripts_table}")
            if transcripts_table is None:
                # NOTE(review): typo in message ("availalble")
                msg_err = "No Transcripts table availalble"
                log.error(msg_err)
                raise ValueError(msg_err)

        # Get transcripts columns
        columns_as_list_query = f"""
            DESCRIBE {transcripts_table}
            """
        columns_as_list = list(
            self.get_query_to_df(columns_as_list_query)["column_name"]
        )

        # Create INFO if not exists
        if "INFO" not in columns_as_list:
            query_add_info = f"""
                ALTER TABLE {transcripts_table} ADD COLUMN INFO STRING DEFAULT '';
                """
            self.execute_query(query_add_info)

        # Prioritization param and Force only PZ Score and Flag
        pz_param = param.get("transcripts", {}).get("prioritization", {})
        pz_fields_score = pz_param.get("pzprefix", "PTZ") + "Score"
        pz_fields_flag = pz_param.get("pzprefix", "PTZ") + "Flag"
        pz_fields_transcripts = pz_param.get("pzprefix", "PTZ") + "Transcript"
        pz_param["pzfields"] = [pz_fields_score, pz_fields_flag]
        pz_profile_default = (
            param.get("transcripts", {}).get("prioritization", {}).get("profiles", None)
        )

        # Exit if no profile
        if pz_profile_default is None:
            log.warning("No profile defined for transcripts prioritization")
            return False

        # Prioritization
        prioritization_result = self.prioritization(
            table=transcripts_table,
            pz_param=param.get("transcripts", {}).get("prioritization", {}),
        )
        if not prioritization_result:
            log.warning("Transcripts prioritization not processed")
            return False

        # Explode PZ fields
        self.explode_infos(
            table=transcripts_table,
            fields=param.get("transcripts", {})
            .get("prioritization", {})
            .get("pzfields", []),
        )

        # Export Transcripts prioritization infos to variants table:
        # keep, per variant, the best-ranked transcript (flag ASC, score DESC, transcript ASC)
        query_update = f"""
            WITH RankedTranscripts AS (
                SELECT
                    "#CHROM", POS, REF, ALT, transcript, {pz_fields_score}, {pz_fields_flag},
                    ROW_NUMBER() OVER (
                        PARTITION BY "#CHROM", POS, REF, ALT
                        ORDER BY {pz_fields_flag} ASC, {pz_fields_score} DESC, transcript ASC
                    ) AS rn
                FROM
                    {transcripts_table}
            )
            UPDATE {table_variants}
            SET
                INFO = CONCAT(CASE
                        WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                        THEN ''
                        ELSE concat("INFO", ';')
                    END,
                    concat('{pz_fields_transcripts}=', transcript, ';{pz_fields_score}=', {pz_fields_score}, ';{pz_fields_flag}=', {pz_fields_flag})
                )
            FROM
                RankedTranscripts
            WHERE
                rn = 1
                AND variants."#CHROM" = RankedTranscripts."#CHROM"
                AND variants."POS" = RankedTranscripts."POS"
                AND variants."REF" = RankedTranscripts."REF"
                AND variants."ALT" = RankedTranscripts."ALT"

            """
        self.execute_query(query=query_update)

        # Add PZ Transcript in header
        # NOTE(review): uses module-level code_type_map (not self.code_type_map) — presumably
        # provided by the wildcard imports from howard.functions.commons; verify
        self.get_header().infos[pz_fields_transcripts] = vcf.parser._Info(
            pz_fields_transcripts,
            ".",
            "String",
            f"Transcript selected from transcripts prioritization process, profile {pz_profile_default}",
            "unknown",
            "unknown",
            code_type_map["String"],
        )

        # Return
        return True

    # NOTE(review): mutable default arguments (columns_maps={}, added_columns=[]) are shared
    # across calls, and added_columns is mutated in the body — consider None defaults
    def create_transcript_view_from_columns_map(
        self,
        transcripts_table: str = "transcripts",
        columns_maps: dict = {},
        added_columns: list = [],
        temporary_tables: list = None,
        annotation_fields: list = None,
    ) -> tuple[list, list, list]:
        """
        The
        `create_transcript_view_from_columns_map` function generates a temporary table view based on
        specified columns mapping for transcripts data.

        :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name of
        the table where the transcripts data is stored or will be stored in the database. This table
        typically contains information about transcripts such as Ensembl transcript IDs, gene names, scores,
        predictions, etc. It defaults to "transcripts, defaults to transcripts
        :type transcripts_table: str (optional)
        :param columns_maps: The `columns_maps` parameter is a dictionary that contains information about
        how to map columns from a transcripts table to create a view. Each entry in the `columns_maps` list
        represents a mapping configuration for a specific set of columns. It typically includes details such
        as the main transcript column and additional information columns
        :type columns_maps: dict
        :param added_columns: The `added_columns` parameter in the `create_transcript_view_from_columns_map`
        function is a list that stores the additional columns that will be added to the view being created
        based on the columns map provided. These columns are generated by exploding the transcript
        information columns along with the main transcript column
        :type added_columns: list
        :param temporary_tables: The `temporary_tables` parameter in the
        `create_transcript_view_from_columns_map` function is a list that stores the names of temporary
        tables created during the process of creating a transcript view from a columns map. These temporary
        tables are used to store intermediate results or transformations before the final view is generated
        :type temporary_tables: list
        :param annotation_fields: The `annotation_fields` parameter in the
        `create_transcript_view_from_columns_map` function is a list that stores the fields that are used
        for annotation in the query view creation process. These fields are extracted from the
        `transcripts_column` and `transcripts_infos_columns` specified in the `columns
        :type annotation_fields: list
        :return: The function `create_transcript_view_from_columns_map` returns a tuple containing three
        lists: `added_columns`, `temporary_tables`, and `annotation_fields`.
        """

        # NOTE(review): typo in log message ("transcrpts")
        log.debug("Start transcrpts view creation from columns map...")

        # Example of the expected columns_maps structure:
        # "from_columns_map": [
        #     {
        #         "transcripts_column": "Ensembl_transcriptid",
        #         "transcripts_infos_columns": [
        #             "genename",
        #             "Ensembl_geneid",
        #             "LIST_S2_score",
        #             "LIST_S2_pred",
        #         ],
        #     },
        #     {
        #         "transcripts_column": "Ensembl_transcriptid",
        #         "transcripts_infos_columns": [
        #             "genename",
        #             "VARITY_R_score",
        #             "Aloft_pred",
        #         ],
        #     },
        # ],

        # Init
        if temporary_tables is None:
            temporary_tables = []
        if annotation_fields is None:
            annotation_fields = []

        # Variants table
        table_variants = self.get_table_variants()

        for columns_map in columns_maps:

            # Transcript column
            transcripts_column = columns_map.get("transcripts_column", None)

            # Transcripts infos columns
            transcripts_infos_columns = columns_map.get("transcripts_infos_columns", [])

            if transcripts_column is not None:

                # Explode INFO into real columns for the transcript column and its infos
                added_columns += self.explode_infos(
                    fields=[transcripts_column] + transcripts_infos_columns
                )

                # View clauses: split each comma-separated column into one row per value
                clause_select = []
                for field in [transcripts_column] + transcripts_infos_columns:
                    clause_select.append(
                        f""" regexp_split_to_table("{field}", ',') AS '{field}' """
                    )
                    if field not in [transcripts_column]:
                        annotation_fields.append(field)

                # Query view
                query = f"""
                    SELECT
                        "#CHROM", POS, REF, ALT, INFO,
                        "{transcripts_column}" AS 'transcript',
                        {", ".join(clause_select)}
                    FROM (
                        SELECT
                            "#CHROM", POS, REF, ALT, INFO,
                            {", ".join(clause_select)}
                        FROM {table_variants}
                    )
                    WHERE "{transcripts_column}" IS NOT NULL
                    """

                # Create temporary table with a random suffix to avoid name collisions
                # NOTE(review): `string` is not imported at the top of this file — presumably
                # provided by the wildcard imports from howard.functions.commons; verify
                temporary_table = transcripts_table + "".join(
                    random.choices(string.ascii_uppercase + string.digits, k=10)
                )

                # Temporary_tables
                temporary_tables.append(temporary_table)
                query_view = f"""
                    CREATE TEMPORARY TABLE {temporary_table}
                    AS ({query})
                    """
                self.execute_query(query=query_view)

        return added_columns, temporary_tables, annotation_fields

    def create_transcript_view_from_column_format(
        self,
        transcripts_table: str = "transcripts",
        column_formats: dict = {},
        temporary_tables: list = None,
        annotation_fields: list = None,
    ) -> tuple[list, list, list]:
        """
        The `create_transcript_view_from_column_format` function generates a transcript view based on
        specified column formats, adds additional columns and annotation fields, and returns the list of
        temporary tables and annotation fields.

        :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name of
        the table containing the transcripts data. This table will be used as the base table for creating
        the transcript view.
The default value for this parameter is "transcripts", but you can provide a 9768 different table name if needed, defaults to transcripts 9769 :type transcripts_table: str (optional) 9770 :param column_formats: The `column_formats` parameter is a dictionary that contains information 9771 about the columns to be used for creating the transcript view. Each entry in the dictionary 9772 specifies the mapping between a transcripts column and a transcripts infos column. For example, in 9773 the provided code snippet: 9774 :type column_formats: dict 9775 :param temporary_tables: The `temporary_tables` parameter in the 9776 `create_transcript_view_from_column_format` function is a list that stores the names of temporary 9777 views created during the process of creating a transcript view from a column format. These temporary 9778 views are used to manipulate and extract data before generating the final transcript view. It 9779 :type temporary_tables: list 9780 :param annotation_fields: The `annotation_fields` parameter in the 9781 `create_transcript_view_from_column_format` function is a list that stores the annotation fields 9782 that are extracted from the temporary views created during the process. These annotation fields are 9783 obtained by querying the temporary views and extracting the column names excluding specific columns 9784 like `#CH 9785 :type annotation_fields: list 9786 :return: The `create_transcript_view_from_column_format` function returns two lists: 9787 `temporary_tables` and `annotation_fields`. 
9788 """ 9789 9790 log.debug("Start transcrpts view creation from column format...") 9791 9792 # "from_column_format": [ 9793 # { 9794 # "transcripts_column": "ANN", 9795 # "transcripts_infos_column": "Feature_ID", 9796 # } 9797 # ], 9798 9799 # Init 9800 if temporary_tables is None: 9801 temporary_tables = [] 9802 if annotation_fields is None: 9803 annotation_fields = [] 9804 9805 for column_format in column_formats: 9806 9807 # annotation field and transcript annotation field 9808 annotation_field = column_format.get("transcripts_column", "ANN") 9809 transcript_annotation = column_format.get( 9810 "transcripts_infos_column", "Feature_ID" 9811 ) 9812 9813 # Temporary View name 9814 temporary_view_name = transcripts_table + "".join( 9815 random.choices(string.ascii_uppercase + string.digits, k=10) 9816 ) 9817 9818 # Create temporary view name 9819 temporary_view_name = self.annotation_format_to_table( 9820 uniquify=True, 9821 annotation_field=annotation_field, 9822 view_name=temporary_view_name, 9823 annotation_id=transcript_annotation, 9824 ) 9825 9826 # Annotation fields 9827 if temporary_view_name: 9828 query_annotation_fields = f""" 9829 SELECT * 9830 FROM ( 9831 DESCRIBE SELECT * 9832 FROM {temporary_view_name} 9833 ) 9834 WHERE column_name not in ('#CHROM', 'POS', 'REF', 'ALT') 9835 """ 9836 df_annotation_fields = self.get_query_to_df( 9837 query=query_annotation_fields 9838 ) 9839 9840 # Add temporary view and annotation fields 9841 temporary_tables.append(temporary_view_name) 9842 annotation_fields += list(set(df_annotation_fields["column_name"])) 9843 9844 return temporary_tables, annotation_fields 9845 9846 def create_transcript_view( 9847 self, 9848 transcripts_table: str = None, 9849 transcripts_table_drop: bool = True, 9850 param: dict = {}, 9851 ) -> str: 9852 """ 9853 The `create_transcript_view` function generates a transcript view by processing data from a 9854 specified table based on provided parameters and structural information. 
9855 9856 :param transcripts_table: The `transcripts_table` parameter in the `create_transcript_view` function 9857 is used to specify the name of the table that will store the final transcript view data. If a table 9858 name is not provided, the function will create a new table to store the transcript view data, and by 9859 default,, defaults to transcripts 9860 :type transcripts_table: str (optional) 9861 :param transcripts_table_drop: The `transcripts_table_drop` parameter in the 9862 `create_transcript_view` function is a boolean parameter that determines whether to drop the 9863 existing transcripts table before creating a new one. If `transcripts_table_drop` is set to `True`, 9864 the function will drop the existing transcripts table if it exists, defaults to True 9865 :type transcripts_table_drop: bool (optional) 9866 :param param: The `param` parameter in the `create_transcript_view` function is a dictionary that 9867 contains information needed to create a transcript view. It includes details such as the structure 9868 of the transcripts, columns mapping, column formats, and other necessary information for generating 9869 the view. This parameter allows for flexibility and customization 9870 :type param: dict 9871 :return: The `create_transcript_view` function returns the name of the transcripts table that was 9872 created or modified during the execution of the function. 
9873 """ 9874 9875 log.debug("Start transcripts view creation...") 9876 9877 # Default 9878 transcripts_table_default = "transcripts" 9879 9880 # Param 9881 if not param: 9882 param = self.get_param() 9883 9884 # Struct 9885 struct = param.get("transcripts", {}).get("struct", None) 9886 9887 if struct: 9888 9889 # Transcripts table 9890 if transcripts_table is None: 9891 transcripts_table = param.get("transcripts", {}).get( 9892 "table", transcripts_table_default 9893 ) 9894 9895 # added_columns 9896 added_columns = [] 9897 9898 # Temporary tables 9899 temporary_tables = [] 9900 9901 # Annotation fields 9902 annotation_fields = [] 9903 9904 # from columns map 9905 columns_maps = struct.get("from_columns_map", []) 9906 added_columns_tmp, temporary_tables_tmp, annotation_fields_tmp = ( 9907 self.create_transcript_view_from_columns_map( 9908 transcripts_table=transcripts_table, 9909 columns_maps=columns_maps, 9910 added_columns=added_columns, 9911 temporary_tables=temporary_tables, 9912 annotation_fields=annotation_fields, 9913 ) 9914 ) 9915 added_columns += added_columns_tmp 9916 temporary_tables += temporary_tables_tmp 9917 annotation_fields += annotation_fields_tmp 9918 9919 # from column format 9920 column_formats = struct.get("from_column_format", []) 9921 temporary_tables_tmp, annotation_fields_tmp = ( 9922 self.create_transcript_view_from_column_format( 9923 transcripts_table=transcripts_table, 9924 column_formats=column_formats, 9925 temporary_tables=temporary_tables, 9926 annotation_fields=annotation_fields, 9927 ) 9928 ) 9929 temporary_tables += temporary_tables_tmp 9930 annotation_fields += annotation_fields_tmp 9931 9932 # Merge temporary tables query 9933 query_merge = "" 9934 for temporary_table in temporary_tables: 9935 9936 # First temporary table 9937 if not query_merge: 9938 query_merge = f""" 9939 SELECT * FROM {temporary_table} 9940 """ 9941 # other temporary table (using UNION) 9942 else: 9943 query_merge += f""" 9944 UNION BY NAME SELECT * FROM 
    def annotation_format_to_table(
        self,
        uniquify: bool = True,
        annotation_field: str = "ANN",
        annotation_id: str = "Feature_ID",
        view_name: str = "transcripts",
    ) -> str:
        """
        Convert a structured VCF annotation field (e.g. snpEff/VEP "ANN") into a
        temporary table, one row per annotation entry, with one typed column per
        annotation sub-field.

        The sub-field names are taken from the annotation field's header
        description (the quoted, '|'-separated format string), the values are
        exploded to JSON per variant, and each key's SQL type is detected from
        the data before building the final table.

        :param uniquify: whether to make exploded annotation values unique,
            defaults to True (forwarded to `explode_annotation_format`)
        :param annotation_field: INFO field holding the annotation, defaults to
            "ANN"
        :param annotation_id: sub-field used as the 'transcript' column in the
            output (non-alphanumeric characters are stripped), defaults to
            "Feature_ID"
        :param view_name: name of the temporary table to create, defaults to
            "transcripts"
        :return: the created table name, or None if `annotation_field` is not
            present in the VCF header
        :raises ValueError: if the header description of `annotation_field` does
            not contain a quoted format string, or yields no sub-fields
        """

        # Internal name of the exploded-to-JSON annotation column
        annotation_format = "annotation_explode"

        # Sanitize the transcript id sub-field name (keep alphanumerics only),
        # matching the cleaning applied to header sub-fields below
        annotation_id = "".join(char for char in annotation_id if char.isalnum())

        # Prefix: any truthy explode prefix is normalized to "INFO/".
        # NOTE(review): if get_explode_infos_prefix() can return None, the
        # concatenations below would raise TypeError — confirm it always
        # returns a string
        prefix = self.get_explode_infos_prefix()
        if prefix:
            prefix = "INFO/"

        # Exploded column names for the annotation field and its JSON form
        annotation_infos = prefix + annotation_field
        annotation_format_infos = prefix + annotation_format

        # Variants table
        table_variants = self.get_table_variants()

        # Header
        vcf_reader = self.get_header()

        # Columns added to the variants table; dropped again before returning
        added_columns = []

        # Explode the annotation field into its own column
        added_columns += self.explode_infos(fields=[annotation_field])

        if annotation_field in vcf_reader.infos:

            # Extract the sub-field names from the quoted part of the header
            # description, e.g. "... 'Allele | Annotation | Feature_ID ...'"
            ann_description = vcf_reader.infos[annotation_field].desc
            pattern = r"'(.+?)'"
            match = re.search(pattern, ann_description)
            if match:
                ann_header_match = match.group(1).split(" | ")
                # ann_header: cleaned names; ann_header_desc: cleaned -> original
                # (only ann_header_desc is used below)
                ann_header = []
                ann_header_desc = {}
                for i in range(len(ann_header_match)):
                    ann_header_info = "".join(
                        char for char in ann_header_match[i] if char.isalnum()
                    )
                    ann_header.append(ann_header_info)
                    ann_header_desc[ann_header_info] = ann_header_match[i]
                if not ann_header_desc:
                    raise ValueError("Invalid header description format")
            else:
                raise ValueError("Invalid header description format")

            # Create variant id column (added to the variants table)
            variant_id_column = self.get_variant_id_column()
            added_columns += [variant_id_column]

            # Load variants plus the exploded annotation column into a DataFrame
            dataframe_annotation_format = self.get_query_to_df(
                f""" SELECT "#CHROM", POS, REF, ALT, INFO, "{variant_id_column}", "{annotation_infos}" FROM {table_variants} """
            )

            # Convert each annotation string into a JSON document keyed by
            # annotation entry, using the header sub-field names
            dataframe_annotation_format[
                annotation_format_infos
            ] = dataframe_annotation_format[annotation_infos].apply(
                lambda x: explode_annotation_format(
                    annotation=str(x),
                    uniquify=uniquify,
                    output_format="JSON",
                    prefix="",
                    header=list(ann_header_desc.values()),
                )
            )

            # Find the JSON keys present in the first annotation entry
            # (DuckDB queries the pandas DataFrame by name)
            query_json = f"""SELECT distinct(unnest(json_keys({annotation_format}, '$.0'))) AS 'key' FROM dataframe_annotation_format;"""
            df_keys = self.get_query_to_df(query=query_json)

            # For each key: detect its SQL type from the data, and build the
            # typed extraction clause for the final table
            query_json_key = []
            for _, row in df_keys.iterrows():

                # Key
                key = row.iloc[0]

                # key_clean: alphanumeric-only version used as column name
                key_clean = "".join(char for char in key if char.isalnum())

                # Extract all values of this key for type detection
                query_json_type = f"""SELECT unnest(json_extract_string({annotation_format}, '$.*."{key}"')) AS '{key_clean}' FROM dataframe_annotation_format WHERE trim('{key}') NOT IN ('');"""

                # Get DataFrame from query
                df_json_type = self.get_query_to_df(query=query_json_type)

                # Fill missing values with empty strings and then replace empty
                # strings or None with NaN and drop rows with NaN, so type
                # detection only sees real values
                with pd.option_context("future.no_silent_downcasting", True):
                    df_json_type.fillna(value="", inplace=True)
                    replace_dict = {None: np.nan, "": np.nan}
                    df_json_type.replace(replace_dict, inplace=True)
                    df_json_type.dropna(inplace=True)

                # Detect column type
                column_type = detect_column_type(df_json_type[key_clean])

                # Typed extraction clause; empty strings become NULL
                query_json_key.append(
                    f"""NULLIF(unnest(json_extract_string({annotation_format}, '$.*."{key}"')), '')::{column_type} AS '{prefix}{key_clean}' """
                )

            # Create the temporary table: one row per annotation entry, with the
            # annotation_id sub-field duplicated as 'transcript'
            query_view = f"""
                CREATE TEMPORARY TABLE {view_name}
                AS (
                    SELECT *, {annotation_id} AS 'transcript'
                    FROM (
                        SELECT "#CHROM", POS, REF, ALT, INFO, {",".join(query_json_key)}
                        FROM dataframe_annotation_format
                    )
                );
            """
            self.execute_query(query=query_view)

        else:

            # Annotation field not in header: nothing to build
            view_name = None

        # Remove columns added to the variants table
        for added_column in added_columns:
            self.drop_column(column=added_column)

        return view_name
    def transcript_view_to_variants(
        self,
        transcripts_table: str = None,
        transcripts_column_id: str = None,
        transcripts_info_json: str = None,
        transcripts_info_field_json: str = None,
        transcripts_info_format: str = None,
        transcripts_info_field_format: str = None,
        param: dict = {},
    ) -> bool:
        """
        Write transcript annotations from the transcripts table back into the
        variants table, as a JSON column and/or INFO field and/or a
        pipe-separated structured column and/or INFO field.

        :param transcripts_table: name of the transcripts table; when None,
            taken from param["transcripts"]["table"] (default "transcripts")
        :param transcripts_column_id: column identifying each transcript; when
            None, taken from param["transcripts"]["column_id"] (default
            "transcript")
        :param transcripts_info_json: name of the JSON column to create on the
            variants table holding all transcript annotations
        :param transcripts_info_field_json: name of the INFO field to append
            with the JSON-formatted transcript annotations
        :param transcripts_info_format: name of the VARCHAR column to create on
            the variants table holding 'transcript|field|...' records
        :param transcripts_info_field_format: name of the INFO field to append
            with the pipe-separated transcript annotations
        :param param: configuration dict; when empty, `get_param()` is used to
            resolve any of the parameters above that were not given
        :return: True on success, False when none of the four output targets is
            configured
        """

        msg_info_prefix = "Start transcripts view to variants annotations"

        log.debug(f"{msg_info_prefix}...")

        # Default
        transcripts_table_default = "transcripts"
        transcripts_column_id_default = "transcript"
        transcripts_info_json_default = None
        transcripts_info_format_default = None
        transcripts_info_field_json_default = None
        transcripts_info_field_format_default = None

        # Param
        if not param:
            param = self.get_param()

        # Transcripts table
        if transcripts_table is None:
            transcripts_table = param.get("transcripts", {}).get(
                "table", transcripts_table_default
            )

        # Transcripts column ID
        if transcripts_column_id is None:
            transcripts_column_id = param.get("transcripts", {}).get(
                "column_id", transcripts_column_id_default
            )

        # Transcripts info json
        if transcripts_info_json is None:
            transcripts_info_json = param.get("transcripts", {}).get(
                "transcripts_info_json", transcripts_info_json_default
            )

        # Transcripts info field JSON
        if transcripts_info_field_json is None:
            transcripts_info_field_json = param.get("transcripts", {}).get(
                "transcripts_info_field_json", transcripts_info_field_json_default
            )
        # if transcripts_info_field_json is not None and transcripts_info_json is None:
        #     transcripts_info_json = transcripts_info_field_json

        # Transcripts info format
        if transcripts_info_format is None:
            transcripts_info_format = param.get("transcripts", {}).get(
                "transcripts_info_format", transcripts_info_format_default
            )

        # Transcripts info field FORMAT
        if transcripts_info_field_format is None:
            transcripts_info_field_format = param.get("transcripts", {}).get(
                "transcripts_info_field_format", transcripts_info_field_format_default
            )
        # if (
        #     transcripts_info_field_format is not None
        #     and transcripts_info_format is None
        # ):
        #     transcripts_info_format = transcripts_info_field_format

        # Variants table
        table_variants = self.get_table_variants()

        # Check info columns param: nothing to do if no output target configured
        if (
            transcripts_info_json is None
            and transcripts_info_field_json is None
            and transcripts_info_format is None
            and transcripts_info_field_format is None
        ):
            return False

        # Transcripts infos columns: all columns of the transcripts table except
        # the variant key and the transcript id
        query_transcripts_infos_columns = f"""
            SELECT *
            FROM (
                DESCRIBE SELECT * FROM {transcripts_table}
            )
            WHERE "column_name" NOT IN ('#CHROM', 'POS', 'REF', 'ALT', '{transcripts_column_id}')
        """
        transcripts_infos_columns = list(
            self.get_query_to_df(query=query_transcripts_infos_columns)["column_name"]
        )

        # Build SQL fragments: split multi-value columns on ',', map each field
        # into JSON struct entries and into pipe-separated format members
        clause_select = []
        clause_to_json = []
        clause_to_format = []
        for field in transcripts_infos_columns:
            clause_select.append(
                f""" regexp_split_to_table("{field}", ',') AS '{field}' """
            )
            clause_to_json.append(f""" '{field}': "{field}" """)
            clause_to_format.append(f""" "{field}" """)

        # SET clauses for the two UPDATE statements built below
        update_set_json = []
        update_set_format = []

        # VCF header
        vcf_reader = self.get_header()

        # Transcripts to info column in JSON
        if transcripts_info_json is not None:

            # Create column on variants table
            self.add_column(
                table_name=table_variants,
                column_name=transcripts_info_json,
                column_type="JSON",
                default_value=None,
                drop=False,
            )

            # Add header
            # NOTE(review): "unknwon" looks like a typo for "unknown" (source/
            # version slots of _Info); other methods in this file use "unknown".
            # NOTE(review): self.code_type_map is used here while other methods
            # use the module-level code_type_map — confirm the attribute exists.
            vcf_reader.infos[transcripts_info_json] = vcf.parser._Info(
                transcripts_info_json,
                ".",
                "String",
                "Transcripts in JSON format",
                "unknwon",
                "unknwon",
                self.code_type_map["String"],
            )

            # Add to update
            update_set_json.append(
                f""" {transcripts_info_json}=t.{transcripts_info_json} """
            )

        # Transcripts to info field in JSON
        if transcripts_info_field_json is not None:

            log.debug(f"{msg_info_prefix} - Annotation in JSON format...")

            # Append the JSON to the INFO field (skipping empty values)
            update_set_json.append(
                f"""
                INFO = concat(
                    CASE
                        WHEN INFO NOT IN ('', '.')
                        THEN INFO
                        ELSE ''
                    END,
                    CASE
                        WHEN CAST(t.{transcripts_info_json} AS VARCHAR) NOT IN ('', '.')
                        THEN concat(
                            ';{transcripts_info_field_json}=',
                            t.{transcripts_info_json}
                        )
                        ELSE ''
                    END
                )
                """
            )

            # Add header
            vcf_reader.infos[transcripts_info_field_json] = vcf.parser._Info(
                transcripts_info_field_json,
                ".",
                "String",
                "Transcripts in JSON format",
                "unknwon",
                "unknwon",
                self.code_type_map["String"],
            )

        if update_set_json:

            # Update query: aggregate all transcripts of a variant into one JSON
            # object keyed by transcript id, then join back on the variant key
            query_update = f"""
                UPDATE {table_variants}
                SET {", ".join(update_set_json)}
                FROM
                    (
                        SELECT
                            "#CHROM", POS, REF, ALT,
                            concat(
                                '{{',
                                string_agg(
                                    '"' || "{transcripts_column_id}" || '":' ||
                                    to_json(json_output)
                                ),
                                '}}'
                            )::JSON AS {transcripts_info_json}
                        FROM
                            (
                                SELECT
                                    "#CHROM", POS, REF, ALT,
                                    "{transcripts_column_id}",
                                    to_json(
                                        {{{",".join(clause_to_json)}}}
                                    )::JSON AS json_output
                                FROM
                                    (SELECT "#CHROM", POS, REF, ALT, "{transcripts_column_id}", {", ".join(clause_select)} FROM {transcripts_table})
                                WHERE "{transcripts_column_id}" IS NOT NULL
                            )
                        GROUP BY "#CHROM", POS, REF, ALT
                    ) AS t
                WHERE {table_variants}."#CHROM" = t."#CHROM"
                    AND {table_variants}."POS" = t."POS"
                    AND {table_variants}."REF" = t."REF"
                    AND {table_variants}."ALT" = t."ALT"
            """

            self.execute_query(query=query_update)

        # Transcripts to info column in FORMAT
        if transcripts_info_format is not None:

            # Create column on variants table
            self.add_column(
                table_name=table_variants,
                column_name=transcripts_info_format,
                column_type="VARCHAR",
                default_value=None,
                drop=False,
            )

            # Add header (see NOTE(review) above on "unknwon"/self.code_type_map)
            vcf_reader.infos[transcripts_info_format] = vcf.parser._Info(
                transcripts_info_format,
                ".",
                "String",
                f"Transcripts annotations: 'transcript | {' | '.join(transcripts_infos_columns)}'",
                "unknwon",
                "unknwon",
                self.code_type_map["String"],
            )

            # Add to update
            update_set_format.append(
                f""" {transcripts_info_format}=t.{transcripts_info_format} """
            )

        # Transcripts to info field in FORMAT
        if transcripts_info_field_format is not None:

            log.debug(f"{msg_info_prefix} - Annotation in structured format...")

            # Append the pipe-separated record to the INFO field
            update_set_format.append(
                f"""
                INFO = concat(
                    CASE
                        WHEN INFO NOT IN ('', '.')
                        THEN INFO
                        ELSE ''
                    END,
                    CASE
                        WHEN CAST(t.{transcripts_info_format} AS VARCHAR) NOT IN ('', '.')
                        THEN concat(
                            ';{transcripts_info_field_format}=',
                            t.{transcripts_info_format}
                        )
                        ELSE ''
                    END
                )
                """
            )

            # Add header
            vcf_reader.infos[transcripts_info_field_format] = vcf.parser._Info(
                transcripts_info_field_format,
                ".",
                "String",
                f"Transcripts annotations: 'transcript | {' | '.join(transcripts_infos_columns)}'",
                "unknwon",
                "unknwon",
                self.code_type_map["String"],
            )

        if update_set_format:

            # Update query: one 'transcript|field|...' record per transcript,
            # aggregated per variant and joined back on the variant key
            query_update = f"""
                UPDATE {table_variants}
                SET {", ".join(update_set_format)}
                FROM
                    (
                        SELECT
                            "#CHROM", POS, REF, ALT,
                            string_agg({transcripts_info_format}) AS {transcripts_info_format}
                        FROM
                            (
                                SELECT
                                    "#CHROM", POS, REF, ALT,
                                    "{transcripts_column_id}",
                                    concat(
                                        "{transcripts_column_id}",
                                        '|',
                                        {", '|', ".join(clause_to_format)}
                                    ) AS {transcripts_info_format}
                                FROM
                                    (SELECT "#CHROM", POS, REF, ALT, "{transcripts_column_id}", {", ".join(clause_select)} FROM {transcripts_table})
                            )
                        GROUP BY "#CHROM", POS, REF, ALT
                    ) AS t
                WHERE {table_variants}."#CHROM" = t."#CHROM"
                    AND {table_variants}."POS" = t."POS"
                    AND {table_variants}."REF" = t."REF"
                    AND {table_variants}."ALT" = t."ALT"
            """

            self.execute_query(query=query_update)

        return True
36 def __init__( 37 self, 38 conn=None, 39 input: str = None, 40 output: str = None, 41 config: dict = {}, 42 param: dict = {}, 43 load: bool = False, 44 ) -> None: 45 """ 46 The function `__init__` initializes the variables, sets the input, output, config, param, connexion and 47 header 48 49 :param conn: the connection to the database 50 :param input: the input file 51 :param output: the output file 52 :param config: a dictionary containing the configuration of the model 53 :param param: a dictionary containing the parameters of the model 54 """ 55 56 # Init variables 57 self.init_variables() 58 59 # Input 60 self.set_input(input) 61 62 # Config 63 self.set_config(config) 64 65 # Param 66 self.set_param(param) 67 68 # Output 69 self.set_output(output) 70 71 # connexion 72 self.set_connexion(conn) 73 74 # Header 75 self.set_header() 76 77 # Samples 78 self.set_samples() 79 80 # Load data 81 if load: 82 self.load_data()
The function __init__ initializes the variables, sets the input, output, config, param, connexion and
header
Parameters
- conn: the connection to the database
- input: the input file
- output: the output file
- config: a dictionary containing the configuration of the model
- param: a dictionary containing the parameters of the model
84 def set_samples(self, samples: list = None) -> list: 85 """ 86 The function `set_samples` sets the samples attribute of an object to a provided list or 87 retrieves it from a parameter dictionary. 88 89 :param samples: The `set_samples` method is a method of a class that takes a list of samples as 90 input and sets the `samples` attribute of the class to the provided list. If no samples are 91 provided, it tries to get the samples from the class's parameters using the `get_param` method 92 :type samples: list 93 :return: The `samples` list is being returned. 94 """ 95 96 if not samples: 97 samples = self.get_param().get("samples", {}).get("list", None) 98 99 self.samples = samples 100 101 return samples
The function set_samples sets the samples attribute of an object to a provided list or
retrieves it from a parameter dictionary.
Parameters
- samples: The `set_samples` method takes a list of samples as input and sets the `samples` attribute of the class to the provided list. If no samples are provided, it tries to get the samples from the class's parameters using the `get_param` method.
Returns
The `samples` list is being returned.
def get_samples(self) -> list:
    """
    Return the list of samples stored on this object.

    :return: the `samples` attribute
    """

    return self.samples
This function returns a list of samples.
Returns
The `get_samples` method is returning the `samples` attribute of the object.
def get_samples_check(self) -> bool:
    """
    Return the "samples.check" flag from the parameters.

    :return: the value of the "check" key inside the "samples" dictionary
        returned by `get_param()`; defaults to True when the key is
        missing (the previous docstring wrongly stated False)
    """

    return self.get_param().get("samples", {}).get("check", True)
This function returns the value of the "check" key within the "samples" dictionary retrieved from the parameters.
Returns
The method `get_samples_check` returns the value of the key "check" inside the "samples" dictionary, which is nested inside the dictionary returned by the `get_param()` method. If the key "check" is not found, it returns `True`.
def set_input(self, input: str = None) -> None:
    """
    Set the input file attributes: path, name, extension and format.

    Accepts either a path string or a file-like object exposing a `.name`
    attribute. Derives `input_name`, `input_extension` and `input_format`
    from the path; when no input is given, they are set to None,
    mirroring `set_output`.

    :param input: the input file path or a file-like object
    :type input: str
    :raises ValueError: when a non-string input has no usable `.name`
    """

    if input and not isinstance(input, str):
        try:
            self.input = input.name
        # Narrowed from a bare except; also fixed the unbalanced quote in
        # the original message (f"...'{input} in bad format")
        except AttributeError:
            log.error(f"Input file '{input}' in bad format")
            raise ValueError(f"Input file '{input}' in bad format")
    else:
        self.input = input

    # Input format
    if input:
        input_name, input_extension = os.path.splitext(self.input)
        self.input_name = input_name
        self.input_extension = input_extension
        self.input_format = self.input_extension.replace(".", "")
    else:
        # Keep the attribute set consistent with set_output
        self.input_name = None
        self.input_extension = None
        self.input_format = None
The function set_input takes a file name as input, extracts the name and extension, and sets
attributes in the class accordingly.
Parameters
- input: The `set_input` method is used to set the attributes related to the input file (path, name, extension and format).
def set_config(self, config: dict) -> None:
    """
    Store the given dictionary as this object's configuration.

    :param config: configuration settings for the object
    :type config: dict
    """

    self.config = config
The set_config function takes a config object and assigns it as the configuration object for the class.
Parameters
- config: The `config` parameter is a dictionary object that contains configuration settings for the class. When the `set_config` function is called with a dictionary argument, that dictionary is set as the configuration object for the class.
def set_param(self, param: dict) -> None:
    """
    Store the given dictionary as this object's parameters.

    :param param: parameters for the object
    :type param: dict
    """

    self.param = param
This function sets a parameter object for the class based on the input dictionary.
Parameters
- param: The `set_param` method takes a dictionary object as input and sets it as the `param` attribute of the class instance.
def init_variables(self) -> None:
    """
    Initialize the internal attributes used throughout the class:
    prefix, table name, comparison-operator map and VCF-type maps.
    """

    self.prefix = "howard"
    self.table_variants = "variants"
    self.dataframe = None

    # SQL operators keyed by symbolic comparison names
    self.comparison_map = dict(
        gt=">",
        gte=">=",
        lt="<",
        lte="<=",
        equals="=",
        contains="SIMILAR TO",
    )

    # VCF header types to internal numeric codes
    self.code_type_map = {"Integer": 0, "String": 1, "Float": 2, "Flag": 3}

    # VCF header types to SQL column types
    self.code_type_map_to_sql = {
        "Integer": "INTEGER",
        "String": "VARCHAR",
        "Float": "FLOAT",
        "Flag": "VARCHAR",
    }

    self.index_additionnal_fields = []
This function initializes the variables that will be used in the rest of the class
def get_indexing(self) -> bool:
    """
    Return the "indexing" flag from the parameters (False when absent).

    :return: the value of the "indexing" parameter
    """

    return self.get_param().get("indexing", False)
It returns the value of the key "indexing" in the dictionary. If the key is not present, it returns False.
Returns
The value of the indexing parameter.
def get_connexion_config(self) -> dict:
    """
    Build the connection configuration dictionary (threads, memory limit,
    temporary directory and access mode) from the object configuration.

    :return: a dictionary suitable for opening the database connection
    """

    # config
    config = self.get_config()

    # Connexion config
    connexion_config = {}

    # Threads
    threads = self.get_threads()
    if threads:
        connexion_config["threads"] = threads

    # Memory
    if self.get_memory():
        connexion_config["memory_limit"] = self.get_memory()

    # Temporary directory
    if config.get("tmp", None):
        connexion_config["temp_directory"] = config.get("tmp")

    # Access
    if config.get("access", None):
        access = config.get("access")
        if access in ["RO"]:
            access = "READ_ONLY"
        elif access in ["RW"]:
            access = "READ_WRITE"
        # An in-memory database can never be opened read-only.
        # Fixed: the original used `connexion_db in ":memory:"`, a
        # substring test that also matched unrelated values such as
        # "memory" or ":"; an equality test is intended.
        connexion_db = self.get_connexion_db()
        if connexion_db == ":memory:":
            access = "READ_WRITE"
        connexion_config["access_mode"] = access

    return connexion_config
The function get_connexion_config returns a dictionary containing the configuration for a
connection, including the number of threads and memory limit.
Returns
a dictionary containing the configuration for the database connection.
def get_duckdb_settings(self) -> dict:
    """
    Retrieve DuckDB settings from the configuration, either as a path to
    a settings file or as an inline JSON string.

    :return: a dictionary of DuckDB settings (empty when not configured)
    """

    duckdb_settings = self.get_config().get("duckdb_settings", None)

    if not duckdb_settings:
        return {}

    duckdb_settings = full_path(duckdb_settings)

    # Settings provided as a file (the YAML parser also reads JSON)
    if os.path.exists(duckdb_settings):
        with open(duckdb_settings) as settings_file:
            return yaml.safe_load(settings_file)

    # Settings provided as an inline JSON string
    return json.loads(duckdb_settings)
The function get_duckdb_settings retrieves DuckDB settings from a configuration file or a
string.
Returns
The function `get_duckdb_settings` returns a dictionary object `duckdb_settings_dict`.
def set_connexion_db(self) -> str:
    """
    Determine and store the database connection string, based on the
    input format (duckdb files connect directly) and the configured
    connection type (memory, tmpfile, or an explicit path).

    :return: the connection string stored in `connexion_db`
    """

    in_memory = ":memory:"
    connexion_type = self.get_connexion_type()

    if self.get_input_format() in ["db", "duckdb"]:
        # A duckdb input file is itself the database
        connexion_db = self.get_input()
    elif connexion_type in ["memory", in_memory, None]:
        connexion_db = in_memory
    elif connexion_type in ["tmpfile"]:
        # Database file inside a dedicated temporary folder
        tmp_folder = tempfile.mkdtemp(
            prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".db"
        )
        connexion_db = f"{tmp_folder}/tmp.db"
    elif connexion_type != "":
        # Explicit connection string
        connexion_db = connexion_type
    else:
        connexion_db = in_memory

    self.connexion_db = connexion_db

    return connexion_db
The function set_connexion_db returns the appropriate database connection string based on the
input format and connection type.
Returns
the value of the variable `connexion_db`.
def set_connexion(self, conn) -> None:
    """
    Create (or adopt) the database connection.

    If `conn` is None, a new connection is opened — duckdb by default, or
    sqlite when the configuration's "connexion_format" says so. For
    duckdb, any configured DuckDB settings are applied as PRAGMA
    statements.

    :param conn: an existing connection to reuse; when None, a new
        connection to `connexion_db` is created
    """

    # Connexion db — must be resolved first: the connexion config below
    # reads it (via get_connexion_db) to decide the access mode
    connexion_db = self.set_connexion_db()

    # Connexion config
    connexion_config = self.get_connexion_config()

    # Connexion format (duckdb by default)
    connexion_format = self.get_config().get("connexion_format", "duckdb")
    # Set connexion format
    self.connexion_format = connexion_format

    # Connexion
    if not conn:
        if connexion_format in ["duckdb"]:
            conn = duckdb.connect(connexion_db, config=connexion_config)
            # DuckDB settings applied as PRAGMA statements; string values
            # must be quoted for SQL
            duckdb_settings = self.get_duckdb_settings()
            if duckdb_settings:
                for setting in duckdb_settings:
                    setting_value = duckdb_settings.get(setting)
                    if isinstance(setting_value, str):
                        setting_value = f"'{setting_value}'"
                    conn.execute(f"PRAGMA {setting}={setting_value};")
        elif connexion_format in ["sqlite"]:
            conn = sqlite3.connect(connexion_db)

    # Set connexion
    self.conn = conn

    # Log
    log.debug(f"connexion_format: {connexion_format}")
    log.debug(f"connexion_db: {connexion_db}")
    log.debug(f"connexion config: {connexion_config}")
    log.debug(f"connexion duckdb settings: {self.get_duckdb_settings()}")
The function set_connexion creates a connection to a database, with options for different
database formats and settings.
Parameters
- conn: The `conn` parameter in the `set_connexion` method is the connection to the database. If a connection is not provided, a new connection to an in-memory database is created. The method then proceeds to set up the connection based on the specified format (e.g., duckdb or sqlite).
def set_output(self, output: str = None) -> None:
    """
    Set the output file attributes: path, name, extension and format.

    Accepts either a path string or a file-like object exposing a `.name`
    attribute; when no output is given, all derived attributes are None.

    :param output: the output file path or a file-like object
    :type output: str
    """

    if output and not isinstance(output, str):
        self.output = output.name
    else:
        self.output = output

    # Derive name / extension / format from the output path
    if self.output:
        base, extension = os.path.splitext(self.output)
        self.output_name = base
        self.output_extension = extension
        self.output_format = extension.replace(".", "")
    else:
        self.output_name = None
        self.output_extension = None
        self.output_format = None
The set_output function in Python sets the output file based on the input or a specified key
in the config file, extracting the output name, extension, and format.
Parameters
- output: The `output` parameter in the `set_output` method is used to specify the name of the output file. If the config file has an 'output' key, the method sets the output to the value of that key. If no output is provided, it sets the output to `None`.
def set_header(self) -> None:
    """
    Read the header of the input file and store it both as a list of
    strings (`header_list`) and as a VCF reader object (`header_vcf`).

    Header lookup order: an explicit "header_file" from the config, the
    input file itself (vcf/hdr, possibly bgzip-compressed), a sibling
    "<input>.hdr" file, and finally a header inferred from the file's
    columns; a minimal default VCF header is the last resort. When there
    is no input file, both attributes are set to None.

    :raises ValueError: when the input file format is not supported
    """

    input_file = self.get_input()
    default_header_list = [
        "##fileformat=VCFv4.2",
        "#CHROM POS ID REF ALT QUAL FILTER INFO",
    ]

    # Full path
    input_file = full_path(input_file)

    if input_file:

        input_format = self.get_input_format()
        input_compressed = self.get_input_compressed()
        config = self.get_config()
        header_list = default_header_list
        if input_format in [
            "vcf",
            "hdr",
            "tsv",
            "csv",
            "psv",
            "parquet",
            "db",
            "duckdb",
        ]:
            # header provided in param
            if config.get("header_file", None):
                with open(config.get("header_file"), "rt") as f:
                    header_list = self.read_vcf_header(f)
            # within a vcf file format (header within input file itself)
            elif input_format in ["vcf", "hdr"] and not os.path.isdir(input_file):
                # within a compressed vcf file format (.vcf.gz)
                if input_compressed:
                    with bgzf.open(input_file, "rt") as f:
                        header_list = self.read_vcf_header(f)
                # within an uncompressed vcf file format (.vcf)
                else:
                    with open(input_file, "rt") as f:
                        header_list = self.read_vcf_header(f)
            # header provided in default external file .hdr
            elif os.path.exists((input_file + ".hdr")):
                with open(input_file + ".hdr", "rt") as f:
                    header_list = self.read_vcf_header(f)
            else:
                try:  # Try to get header info fields and file columns

                    with tempfile.TemporaryDirectory() as tmpdir:

                        # Create database
                        db_for_header = Database(database=input_file)

                        # Get header columns for infos fields
                        db_header_from_columns = (
                            db_for_header.get_header_from_columns()
                        )

                        # Get real columns in the file
                        db_header_columns = db_for_header.get_columns()

                        # Write header file — context manager guarantees
                        # the handle is closed even if vcf.Writer raises
                        # (the original used open()/close() around it)
                        header_file_tmp = os.path.join(tmpdir, "header")
                        with open(header_file_tmp, "w") as f:
                            vcf.Writer(f, db_header_from_columns)

                        # Replace #CHROM line with real columns
                        header_list = db_for_header.read_header_file(
                            header_file=header_file_tmp
                        )
                        header_list[-1] = "\t".join(db_header_columns)

                # Narrowed from a bare except so KeyboardInterrupt and
                # SystemExit are not swallowed
                except Exception:

                    log.warning(
                        f"No header for file {input_file}. Set as default VCF header"
                    )
                    header_list = default_header_list

        else:  # try for unknown format ?

            log.error(f"Input file format '{input_format}' not available")
            raise ValueError(f"Input file format '{input_format}' not available")

        if not header_list:
            header_list = default_header_list

        # header as list
        self.header_list = header_list

        # header as VCF object
        self.header_vcf = vcf.Reader(io.StringIO("\n".join(header_list)))

    else:

        self.header_list = None
        self.header_vcf = None
It reads the header of a VCF file and stores it as a list of strings and as a VCF object
def get_query_to_df(self, query: str = "", limit: int = None) -> pd.DataFrame:
    """
    Execute an SQL query and return the result as a pandas DataFrame.

    :param query: the SQL query to execute on the current connection
    :type query: str
    :param limit: maximum number of rows to fetch; when given, only the
        first batch/chunk of that size is returned (note: the pandas
        "display.max_rows" option is also set to this value, preserving
        the original behavior)
    :type limit: int
    :return: the query result as a pandas DataFrame
    :raises ValueError: when the connection format is not supported
    """

    # Connexion format
    connexion_format = self.get_connexion_format()

    # Fixed: the original fell through with `df` unbound for unknown
    # connection formats, raising an obscure NameError
    if connexion_format not in ["duckdb", "sqlite"]:
        raise ValueError(f"Connexion format '{connexion_format}' not available")

    # Limit in query
    if limit:
        pd.set_option("display.max_rows", limit)
        if connexion_format in ["duckdb"]:
            df = (
                self.conn.execute(query)
                .fetch_record_batch(limit)
                .read_next_batch()
                .to_pandas()
            )
        elif connexion_format in ["sqlite"]:
            df = next(pd.read_sql_query(query, self.conn, chunksize=limit))

    # Full query
    else:
        if connexion_format in ["duckdb"]:
            df = self.conn.execute(query).df()
        elif connexion_format in ["sqlite"]:
            df = pd.read_sql_query(query, self.conn)

    return df
The get_query_to_df function takes a query as a string and returns the result as a pandas
DataFrame based on the connection format.
Parameters
- query: The `query` parameter is a string that represents the SQL query to execute. The query is used to fetch data from the database and convert it into a pandas DataFrame.
- limit: The `limit` parameter specifies the maximum number of rows to be returned in the resulting DataFrame. If a limit is provided, the function only fetches up to that number of rows from the query result; if no limit is specified, the full query result is returned.
Returns
A pandas DataFrame is being returned by the `get_query_to_df` function.
def get_overview(self) -> None:
    """
    Log an overview of the object: input/output files and formats,
    configuration, parameters, sample list and the variants DataFrame.

    :return: None
    """

    variants_table = self.get_table_variants(clause="from")
    columns_sql = self.get_header_columns_as_sql()
    overview_query = f"SELECT {columns_sql} FROM {variants_table}"
    df = self.get_query_to_df(overview_query)

    log.info(f"Input: {self.get_input()} [{self.get_input_format()}]")
    log.info(f"Output: {self.get_output()} [{self.get_output_format()}]")

    log.info("Config: ")
    config_dump = json.dumps(self.get_config(), indent=4, sort_keys=True)
    for line in str(config_dump).split("\n"):
        log.info("\t" + str(line))

    log.info("Param: ")
    param_dump = json.dumps(self.get_param(), indent=4, sort_keys=True)
    for line in str(param_dump).split("\n"):
        log.info("\t" + str(line))

    log.info("Sample list: " + str(self.get_header_sample_list()))

    log.info("Dataframe: ")
    for line in str(df).split("\n"):
        log.info("\t" + str(line))

    # Release the preview DataFrame eagerly
    del df
    gc.collect()

    return None
The function prints the input, output, config, and dataframe of the current object
def get_stats(self) -> dict:
    """
    Compute statistics of the current object: input file, number of
    variants (total and per chromosome), per-sample genotype counts,
    INFO/FORMAT header fields, QUAL statistics and SNV/MNV/InDel counts.

    :return: a dictionary with "Infos", "Variants", "Samples", "Header"
        and (when a QUAL column exists) "Quality" sections
    """

    # Log
    log.info(f"Stats Calculation...")

    # table variants
    table_variants_from = self.get_table_variants()

    # stats dict
    stats = {"Infos": {}}

    ### File
    input_file = self.get_input()
    stats["Infos"]["Input file"] = input_file

    # Header INFO and FORMAT field definitions
    header_infos = self.get_header().infos
    header_formats = self.get_header().formats
    header_infos_list = list(header_infos)
    header_formats_list = list(header_formats)

    ### Variants

    stats["Variants"] = {}

    # Variants by chr
    sql_query_nb_variant_by_chrom = f'SELECT "#CHROM" as CHROM, count(*) as count FROM {table_variants_from} GROUP BY "#CHROM"'
    df_nb_of_variants_by_chrom = self.get_query_to_df(sql_query_nb_variant_by_chrom)
    nb_of_variants_by_chrom = df_nb_of_variants_by_chrom.sort_values(
        by=["CHROM"], kind="quicksort"
    )

    # Total number of variants
    nb_of_variants = nb_of_variants_by_chrom["count"].sum()

    # Calculate percentage (fraction of total per chromosome)
    nb_of_variants_by_chrom["percent"] = nb_of_variants_by_chrom["count"].apply(
        lambda x: (x / nb_of_variants)
    )

    stats["Variants"]["Number of variants by chromosome"] = (
        nb_of_variants_by_chrom.to_dict(orient="index")
    )

    stats["Infos"]["Number of variants"] = int(nb_of_variants)

    ### Samples

    # Init
    samples = {}
    nb_of_samples = 0

    # Check Samples: a column only counts as a sample when it holds
    # genotype-shaped values matching the FORMAT field layout
    if "GT" in header_formats_list and "FORMAT" in self.get_header_columns():
        log.debug(f"Check samples...")
        for sample in self.get_header_sample_list():
            # Genotype is the leading "0/1"-like token before the first ':'
            sql_query_samples = f"""
                SELECT
                    '{sample}' as sample,
                    REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1) as genotype,
                    count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1)) as count,
                    concat((count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1))/{nb_of_variants})) as percentage
                FROM {table_variants_from}
                WHERE (
                    regexp_matches("{sample}", '^[0-9]([/|][0-9])+')
                    AND
                    len(string_split(CAST("FORMAT" AS VARCHAR), ':')) = len(string_split(CAST("{sample}" AS VARCHAR), ':'))
                )
                GROUP BY genotype
            """
            sql_query_genotype_df = self.conn.execute(sql_query_samples).df()
            sample_genotype_count = sql_query_genotype_df["count"].sum()
            if len(sql_query_genotype_df):
                nb_of_samples += 1
                samples[f"{sample} - {sample_genotype_count} variants"] = (
                    sql_query_genotype_df.to_dict(orient="index")
                )

    stats["Samples"] = samples
    stats["Infos"]["Number of samples"] = nb_of_samples

    ### INFO and FORMAT fields
    header_types_df = {}
    header_types_list = {
        "List of INFO fields": header_infos,
        "List of FORMAT fields": header_formats,
    }
    # NOTE: `i` is intentionally NOT reset between INFO and FORMAT, so
    # row ids keep increasing across both tables
    i = 0
    for header_type in header_types_list:

        header_type_infos = header_types_list.get(header_type)
        header_infos_dict = {}

        for info in header_type_infos:

            i += 1
            header_infos_dict[i] = {}

            # ID
            header_infos_dict[i]["id"] = info

            # num: special VCF "Number" codes (None='.', -1='A', -2='G', -3='R')
            genotype_map = {None: ".", -1: "A", -2: "G", -3: "R"}
            if header_type_infos[info].num in genotype_map.keys():
                header_infos_dict[i]["Number"] = genotype_map.get(
                    header_type_infos[info].num
                )
            else:
                header_infos_dict[i]["Number"] = header_type_infos[info].num

            # type
            if header_type_infos[info].type:
                header_infos_dict[i]["Type"] = header_type_infos[info].type
            else:
                header_infos_dict[i]["Type"] = "."

            # desc
            if header_type_infos[info].desc != None:
                header_infos_dict[i]["Description"] = header_type_infos[info].desc
            else:
                header_infos_dict[i]["Description"] = ""

        if len(header_infos_dict):
            header_types_df[header_type] = pd.DataFrame.from_dict(
                header_infos_dict, orient="index"
            ).to_dict(orient="index")

    # Stats
    stats["Infos"]["Number of INFO fields"] = len(header_infos_list)
    stats["Infos"]["Number of FORMAT fields"] = len(header_formats_list)
    stats["Header"] = header_types_df

    ### QUAL
    if "QUAL" in self.get_header_columns():
        sql_query_qual = f"""
            SELECT
                avg(CAST(QUAL AS INTEGER)) AS Average,
                min(CAST(QUAL AS INTEGER)) AS Minimum,
                max(CAST(QUAL AS INTEGER)) AS Maximum,
                stddev(CAST(QUAL AS INTEGER)) AS StandardDeviation,
                median(CAST(QUAL AS INTEGER)) AS Median,
                variance(CAST(QUAL AS INTEGER)) AS Variance
            FROM {table_variants_from}
            WHERE CAST(QUAL AS VARCHAR) NOT IN ('.')
        """

        qual = self.conn.execute(sql_query_qual).df().to_dict(orient="index")
        stats["Quality"] = {"Stats": qual}

    ### SNV and InDel
    # NOTE(review): in the InDel branch, AND binds tighter than OR, so the
    # condition reads as len(REF)>1 OR (len(ALT)>1 AND len(REF)!=len(ALT))
    # — confirm this is the intended classification

    sql_query_snv = f"""

        SELECT Type, count FROM (

            SELECT
                'Total' AS Type,
                count(*) AS count
            FROM {table_variants_from}

            UNION

            SELECT
                'MNV' AS Type,
                count(*) AS count
            FROM {table_variants_from}
            WHERE len(REF) > 1 AND len(ALT) > 1
            AND len(REF) = len(ALT)

            UNION

            SELECT
                'InDel' AS Type,
                count(*) AS count
            FROM {table_variants_from}
            WHERE len(REF) > 1 OR len(ALT) > 1
            AND len(REF) != len(ALT)

            UNION

            SELECT
                'SNV' AS Type,
                count(*) AS count
            FROM {table_variants_from}
            WHERE len(REF) = 1 AND len(ALT) = 1

        )

        ORDER BY count DESC

    """
    snv_indel = self.conn.execute(sql_query_snv).df().to_dict(orient="index")

    # Substitution spectrum over SNVs only (e.g. "A>G")
    sql_query_snv_substitution = f"""
        SELECT
            concat(REF, '>', ALT) AS 'Substitution',
            count(*) AS count
        FROM {table_variants_from}
        WHERE len(REF) = 1 AND len(ALT) = 1
        GROUP BY REF, ALT
        ORDER BY count(*) DESC
    """
    snv_substitution = (
        self.conn.execute(sql_query_snv_substitution).df().to_dict(orient="index")
    )
    stats["Variants"]["Counts"] = snv_indel
    stats["Variants"]["Substitutions"] = snv_substitution

    return stats
The get_stats function calculates and returns various statistics of the current object,
including information about the input file, variants, samples, header fields, quality, and
SNVs/InDels.
Returns
a dictionary containing various statistics of the current object. The dictionary has the following structure:
def stats_to_file(self, file: str = None) -> str:
    """
    Compute the statistics and write them to a JSON file.

    :param file: path of the JSON file to write
    :type file: str
    :return: the path of the file that was written
    """

    # Serialize the stats dictionary
    serialized = json.dumps(self.get_stats(), indent=4)

    # Write the JSON document to the target file
    with open(file, "w") as outfile:
        outfile.write(serialized)

    return file
The function stats_to_file takes a file name as input, retrieves statistics, serializes them
into a JSON object, and writes the JSON object to the specified file.
Parameters
- file: The `file` parameter is a string that represents the file path where the JSON data will be written.
Returns
the name of the file that was written to.
def print_stats(self, output_file: str = None, json_file: str = None) -> None:
    """
    Generate a Markdown report of the statistics and print it.

    Writes the stats as JSON (via `stats_to_file`), renders each section
    as Markdown (tables where the data is table-shaped, bullet lines
    otherwise), writes the Markdown to `output_file` and prints it.

    :param output_file: path of the Markdown output file; when None, a
        "stats.md" file inside a temporary directory is used
    :type output_file: str
    :param json_file: path of the JSON stats file; when None, a
        "stats.json" file inside a temporary directory is used
    :type json_file: str
    :return: None
    """

    # Full path
    output_file = full_path(output_file)
    json_file = full_path(json_file)

    with tempfile.TemporaryDirectory() as tmpdir:

        # Default file locations inside the temporary directory
        if not output_file:
            output_file = os.path.join(tmpdir, "stats.md")
        if not json_file:
            json_file = os.path.join(tmpdir, "stats.json")

        # Create folders
        if not os.path.exists(os.path.dirname(output_file)):
            Path(os.path.dirname(output_file)).mkdir(parents=True, exist_ok=True)
        if not os.path.exists(os.path.dirname(json_file)):
            Path(os.path.dirname(json_file)).mkdir(parents=True, exist_ok=True)

        # Create stats JSON file
        stats_file = self.stats_to_file(file=json_file)

        # Read stats back (YAML parser also reads JSON)
        with open(stats_file) as f:
            stats = yaml.safe_load(f)

        # Output buffers: title, index (table of contents) and body
        output_title = []
        output_index = []
        output = []

        # Title
        output_title.append("# HOWARD Stats")

        # Index
        output_index.append("## Index")

        # Process sections
        for section in stats:
            infos = stats.get(section)
            section_link = "#" + section.lower().replace(" ", "-")
            output.append(f"## {section}")
            output_index.append(f"- [{section}]({section_link})")

            if len(infos):
                for info in infos:
                    # Try to render the value as a table: first as a dict,
                    # then as a JSON string; fall back to a bullet line
                    try:
                        df = pd.DataFrame.from_dict(infos.get(info), orient="index")
                        is_df = True
                    # NOTE(review): bare except — consider narrowing to Exception
                    except:
                        try:
                            df = pd.DataFrame.from_dict(
                                json.loads((infos.get(info))), orient="index"
                            )
                            is_df = True
                        except:
                            is_df = False
                    if is_df:
                        output.append(f"### {info}")
                        info_link = "#" + info.lower().replace(" ", "-")
                        output_index.append(f" - [{info}]({info_link})")
                        output.append(f"{df.to_markdown(index=False)}")
                    else:
                        output.append(f"- {info}: {infos.get(info)}")
            else:
                output.append(f"NA")

        # Write stats in markdown file
        with open(output_file, "w") as fp:
            for item in output_title:
                fp.write("%s\n" % item)
            for item in output_index:
                fp.write("%s\n" % item)
            for item in output:
                fp.write("%s\n" % item)

        # Output stats in markdown
        print("")
        print("\n\n".join(output_title))
        print("")
        print("\n\n".join(output))
        print("")

    return None
The print_stats function generates a markdown file and prints the statistics contained in a
JSON file in a formatted manner.
Parameters
- output_file: The `output_file` parameter is a string that specifies the path and filename of the output file where the stats will be printed in Markdown format. If no `output_file` is provided, a temporary directory is created and the stats are saved in a file named "stats.md" within it.
- json_file: The `json_file` parameter is a string that represents the path to the JSON file where the statistics will be saved. If no value is provided, a temporary directory is created and a default file name "stats.json" is used.
Returns
The function `print_stats` does not return any value; it has a return type annotation of `None`.
def get_input(self) -> str:
    """
    Return the input file path.

    :return: the `input` attribute
    """

    return self.input
It returns the value of the input variable.
Returns
The input is being returned.
def get_input_format(self, input_file: str = None) -> str:
    """
    Return the file format of the given file, defaulting to the object's
    input file when none is provided.

    :param input_file: path of the file to inspect; defaults to `get_input()`
    :type input_file: str
    :return: the detected file format
    """

    target = input_file if input_file else self.get_input()
    return get_file_format(target)
This function returns the format of the input variable, either from the provided input file or by prompting for input.
Parameters
- input_file: The `input_file` parameter in the `get_input_format` method is a string that represents the file path of the input file. If no `input_file` is provided when calling the method, it defaults to `None`.
Returns
The format of the input variable is being returned.
939 def get_input_compressed(self, input_file: str = None) -> str: 940 """ 941 The function `get_input_compressed` returns the format of the input variable after compressing 942 it. 943 944 :param input_file: The `input_file` parameter in the `get_input_compressed` method is a string 945 that represents the file path of the input file. If no `input_file` is provided when calling the 946 method, it will default to `None` and the method will then call `self.get_input()` to 947 :type input_file: str 948 :return: The function `get_input_compressed` returns the compressed format of the input 949 variable. 950 """ 951 952 if not input_file: 953 input_file = self.get_input() 954 input_compressed = get_file_compressed(input_file) 955 return input_compressed
The function get_input_compressed returns the format of the input variable after compressing
it.
Parameters
- input_file: The
`input_file` parameter in the `get_input_compressed` method is a string that represents the file path of the input file. If no `input_file` is provided when calling the method, it will default to `None` and the method will then call `self.get_input()` to obtain it.
Returns
The function
`get_input_compressed` returns the compressed format of the input variable.
957 def get_output(self) -> str: 958 """ 959 It returns the output of the neuron. 960 :return: The output of the neural network. 961 """ 962 963 return self.output
It returns the output of the neuron.
Returns
The output of the neural network.
965 def get_output_format(self, output_file: str = None) -> str: 966 """ 967 The function `get_output_format` returns the format of the input variable or the output file if 968 provided. 969 970 :param output_file: The `output_file` parameter in the `get_output_format` method is a string 971 that represents the file path of the output file. If no `output_file` is provided when calling 972 the method, it will default to the output obtained from the `get_output` method of the class 973 instance. The 974 :type output_file: str 975 :return: The format of the input variable is being returned. 976 """ 977 978 if not output_file: 979 output_file = self.get_output() 980 output_format = get_file_format(output_file) 981 982 return output_format
The function get_output_format returns the format of the input variable or the output file if
provided.
Parameters
- output_file: The
`output_file` parameter in the `get_output_format` method is a string that represents the file path of the output file. If no `output_file` is provided when calling the method, it will default to the output obtained from the `get_output` method of the class instance.
Returns
The format of the input variable is being returned.
984 def get_config(self) -> dict: 985 """ 986 It returns the config 987 :return: The config variable is being returned. 988 """ 989 return self.config
It returns the config
Returns
The config variable is being returned.
991 def get_param(self) -> dict: 992 """ 993 It returns the param 994 :return: The param variable is being returned. 995 """ 996 return self.param
It returns the param
Returns
The param variable is being returned.
998 def get_connexion_db(self) -> str: 999 """ 1000 It returns the connexion_db attribute of the object 1001 :return: The connexion_db is being returned. 1002 """ 1003 return self.connexion_db
It returns the connexion_db attribute of the object
Returns
The connexion_db is being returned.
1005 def get_prefix(self) -> str: 1006 """ 1007 It returns the prefix of the object. 1008 :return: The prefix is being returned. 1009 """ 1010 return self.prefix
It returns the prefix of the object.
Returns
The prefix is being returned.
1012 def get_table_variants(self, clause: str = "select") -> str: 1013 """ 1014 This function returns the table_variants attribute of the object 1015 1016 :param clause: the type of clause the table will be used. Either "select" or "from" (optional), 1017 defaults to select (optional) 1018 :return: The table_variants attribute of the object. 1019 """ 1020 1021 # Access 1022 access = self.get_config().get("access", None) 1023 1024 # Clauses "select", "where", "update" 1025 if clause in ["select", "where", "update"]: 1026 table_variants = self.table_variants 1027 # Clause "from" 1028 elif clause in ["from"]: 1029 # For Read Only 1030 if self.get_input_format() in ["parquet"] and access in ["RO"]: 1031 input_file = self.get_input() 1032 table_variants = f"'{input_file}' as variants" 1033 # For Read Write 1034 else: 1035 table_variants = f"{self.table_variants} as variants" 1036 else: 1037 table_variants = self.table_variants 1038 return table_variants
This function returns the table_variants attribute of the object
Parameters
- clause: the type of clause the table will be used. Either "select" or "from" (optional), defaults to select (optional)
Returns
The table_variants attribute of the object.
1040 def get_tmp_dir(self) -> str: 1041 """ 1042 The function `get_tmp_dir` returns the temporary directory path based on configuration 1043 parameters or a default path. 1044 :return: The `get_tmp_dir` method is returning the temporary directory path based on the 1045 configuration, parameters, and a default value of "/tmp". 1046 """ 1047 1048 return get_tmp( 1049 config=self.get_config(), param=self.get_param(), default_tmp="/tmp" 1050 )
The function get_tmp_dir returns the temporary directory path based on configuration
parameters or a default path.
Returns
The
`get_tmp_dir` method returns the temporary directory path based on the configuration, parameters, and a default value of "/tmp".
1052 def get_connexion_type(self) -> str: 1053 """ 1054 If the connexion type is not in the list of allowed connexion types, raise a ValueError 1055 1056 :return: The connexion type is being returned. 1057 """ 1058 return self.get_config().get("connexion_type", "memory")
If the connexion type is not in the list of allowed connexion types, raise a ValueError
Returns
The connexion type is being returned.
1060 def get_connexion(self): 1061 """ 1062 It returns the connection object 1063 1064 :return: The connection object. 1065 """ 1066 return self.conn
It returns the connection object
Returns
The connection object.
1068 def close_connexion(self) -> None: 1069 """ 1070 This function closes the connection to the database. 1071 :return: The connection is being closed. 1072 """ 1073 return self.conn.close()
This function closes the connection to the database.
Returns
The connection is being closed.
1075 def get_header(self, type: str = "vcf"): 1076 """ 1077 This function returns the header of the VCF file as a list of strings 1078 1079 :param type: the type of header you want to get, defaults to vcf (optional) 1080 :return: The header of the vcf file. 1081 """ 1082 1083 if self.header_vcf: 1084 if type == "vcf": 1085 return self.header_vcf 1086 elif type == "list": 1087 return self.header_list 1088 else: 1089 if type == "vcf": 1090 header = vcf.Reader(io.StringIO("\n".join(vcf_required))) 1091 return header 1092 elif type == "list": 1093 return vcf_required
This function returns the header of the VCF file as a list of strings
Parameters
- type: the type of header you want to get, defaults to vcf (optional)
Returns
The header of the vcf file.
1095 def get_header_length(self, file: str = None) -> int: 1096 """ 1097 The function `get_header_length` returns the length of the header list, excluding the #CHROM 1098 line. 1099 1100 :param file: The `file` parameter is an optional argument that specifies the path to a VCF 1101 header file. If this argument is provided, the function will read the header from the specified 1102 file and return the length of the header list minus 1 (to exclude the #CHROM line) 1103 :type file: str 1104 :return: the length of the header list, excluding the #CHROM line. 1105 """ 1106 1107 if file: 1108 return len(self.read_vcf_header_file(file=file)) - 1 1109 elif self.get_header(type="list"): 1110 return len(self.get_header(type="list")) - 1 1111 else: 1112 return 0
The function get_header_length returns the length of the header list, excluding the #CHROM
line.
Parameters
- file: The
`file` parameter is an optional argument that specifies the path to a VCF header file. If this argument is provided, the function will read the header from the specified file and return the length of the header list minus 1 (to exclude the #CHROM line).
Returns
the length of the header list, excluding the #CHROM line.
1114 def get_header_columns(self) -> str: 1115 """ 1116 This function returns the header list of a VCF 1117 1118 :return: The length of the header list. 1119 """ 1120 if self.get_header(): 1121 return self.get_header(type="list")[-1] 1122 else: 1123 return ""
This function returns the header list of a VCF
Returns
The length of the header list.
1125 def get_header_columns_as_list(self) -> list: 1126 """ 1127 This function returns the header list of a VCF 1128 1129 :return: The length of the header list. 1130 """ 1131 if self.get_header(): 1132 return self.get_header_columns().strip().split("\t") 1133 else: 1134 return []
This function returns the header list of a VCF
Returns
The length of the header list.
1136 def get_header_columns_as_sql(self) -> str: 1137 """ 1138 This function retruns header length (without #CHROM line) 1139 1140 :return: The length of the header list. 1141 """ 1142 sql_column_list = [] 1143 for col in self.get_header_columns_as_list(): 1144 sql_column_list.append(f'"{col}"') 1145 return ",".join(sql_column_list)
This function retruns header length (without #CHROM line)
Returns
The length of the header list.
1147 def get_header_sample_list( 1148 self, check: bool = False, samples: list = None, samples_force: bool = False 1149 ) -> list: 1150 """ 1151 The function `get_header_sample_list` returns a list of samples from a VCF header, with optional 1152 checking and filtering based on input parameters. 1153 1154 :param check: The `check` parameter in the `get_header_sample_list` function is a boolean 1155 parameter that determines whether to check if the samples in the list are properly defined as 1156 genotype columns. If `check` is set to `True`, the function will verify if each sample in the 1157 list is defined as a, defaults to False 1158 :type check: bool (optional) 1159 :param samples: The `samples` parameter in the `get_header_sample_list` function is a list that 1160 allows you to specify a subset of samples from the header. If you provide a list of sample 1161 names, the function will check if each sample is defined in the header. If a sample is not found 1162 in the 1163 :type samples: list 1164 :param samples_force: The `samples_force` parameter in the `get_header_sample_list` function is 1165 a boolean parameter that determines whether to force the function to return the sample list 1166 without checking if the samples are genotype columns. If `samples_force` is set to `True`, the 1167 function will return the sample list without performing, defaults to False 1168 :type samples_force: bool (optional) 1169 :return: The function `get_header_sample_list` returns a list of samples based on the input 1170 parameters and conditions specified in the function. 
1171 """ 1172 1173 # Init 1174 samples_list = [] 1175 1176 if samples is None: 1177 samples_list = self.header_vcf.samples 1178 else: 1179 samples_checked = [] 1180 for sample in samples: 1181 if sample in self.header_vcf.samples: 1182 samples_checked.append(sample) 1183 else: 1184 log.warning(f"Sample '{sample}' not defined in header") 1185 samples_list = samples_checked 1186 1187 # Force sample list without checking if is_genotype_column 1188 if samples_force: 1189 log.warning(f"Samples {samples_list} not checked if genotypes") 1190 return samples_list 1191 1192 if check: 1193 samples_checked = [] 1194 for sample in samples_list: 1195 if self.is_genotype_column(column=sample): 1196 samples_checked.append(sample) 1197 else: 1198 log.warning( 1199 f"Sample '{sample}' not defined as a sample (genotype not well defined)" 1200 ) 1201 samples_list = samples_checked 1202 1203 # Return samples list 1204 return samples_list
The function get_header_sample_list returns a list of samples from a VCF header, with optional
checking and filtering based on input parameters.
Parameters
- check: The
checkparameter in theget_header_sample_listfunction is a boolean parameter that determines whether to check if the samples in the list are properly defined as genotype columns. Ifcheckis set toTrue, the function will verify if each sample in the list is defined as a, defaults to False - samples: The
samplesparameter in theget_header_sample_listfunction is a list that allows you to specify a subset of samples from the header. If you provide a list of sample names, the function will check if each sample is defined in the header. If a sample is not found in the - samples_force: The
samples_forceparameter in theget_header_sample_listfunction is a boolean parameter that determines whether to force the function to return the sample list without checking if the samples are genotype columns. Ifsamples_forceis set toTrue, the function will return the sample list without performing, defaults to False
Returns
The function
get_header_sample_listreturns a list of samples based on the input parameters and conditions specified in the function.
1206 def is_genotype_column(self, column: str = None) -> bool: 1207 """ 1208 This function checks if a given column is a genotype column in a database. 1209 1210 :param column: The `column` parameter in the `is_genotype_column` method is a string that 1211 represents the column name in a database table. This method checks if the specified column is a 1212 genotype column in the database. If a column name is provided, it calls the `is_genotype_column` 1213 method of 1214 :type column: str 1215 :return: The `is_genotype_column` method is returning a boolean value. If the `column` parameter 1216 is not None, it calls the `is_genotype_column` method of the `Database` class with the specified 1217 column name and returns the result. If the `column` parameter is None, it returns False. 1218 """ 1219 1220 if column is not None: 1221 return Database(database=self.get_input()).is_genotype_column(column=column) 1222 else: 1223 return False
This function checks if a given column is a genotype column in a database.
Parameters
- column: The
`column` parameter in the `is_genotype_column` method is a string that represents the column name in a database table. This method checks if the specified column is a genotype column in the database.
Returns
The
`is_genotype_column` method is returning a boolean value. If the `column` parameter is not None, it calls the `is_genotype_column` method of the `Database` class with the specified column name and returns the result. If the `column` parameter is None, it returns False.
1225 def get_verbose(self) -> bool: 1226 """ 1227 It returns the value of the "verbose" key in the config dictionary, or False if the key doesn't 1228 exist 1229 1230 :return: The value of the key "verbose" in the config dictionary. 1231 """ 1232 return self.get_config().get("verbose", False)
It returns the value of the "verbose" key in the config dictionary, or False if the key doesn't exist
Returns
The value of the key "verbose" in the config dictionary.
1234 def get_connexion_format(self) -> str: 1235 """ 1236 It returns the connexion format of the object. 1237 :return: The connexion_format is being returned. 1238 """ 1239 connexion_format = self.connexion_format 1240 if connexion_format not in ["duckdb", "sqlite"]: 1241 log.error(f"Unknown connexion format {connexion_format}") 1242 raise ValueError(f"Unknown connexion format {connexion_format}") 1243 else: 1244 return connexion_format
It returns the connexion format of the object.
Returns
The connexion_format is being returned.
    def insert_file_to_table(
        self,
        file,
        columns: str,
        header_len: int = 0,
        sep: str = "\t",
        chunksize: int = 1000000,
    ) -> None:
        """Read a delimited file in chunks and insert it into the variants table.

        :param file: path or open file object to load, readable by
            ``pandas.read_csv``
        :param columns: comma-separated, quoted column names used in the
            INSERT statement (duckdb path only)
        :type columns: str
        :param header_len: number of leading lines to skip (e.g. VCF
            meta-header lines), defaults to 0
        :type header_len: int (optional)
        :param sep: column delimiter of the file, defaults to "\\t"
        :type sep: str (optional)
        :param chunksize: number of rows per chunk; may be overridden by the
            "load"/"chunk" configuration value, defaults to 1000000
        :type chunksize: int (optional)
        """

        # Config: the configured chunk size takes precedence over the argument
        chunksize = self.get_config().get("load", {}).get("chunk", chunksize)
        connexion_format = self.get_connexion_format()

        log.debug("chunksize: " + str(chunksize))

        # NOTE(review): a falsy chunksize silently skips loading entirely —
        # confirm this is the intended behavior
        if chunksize:
            for chunk in pd.read_csv(
                file, skiprows=header_len, sep=sep, chunksize=chunksize, engine="c"
            ):
                if connexion_format in ["duckdb"]:
                    # The SQL references the local DataFrame by its variable
                    # name: duckdb resolves `chunk` from the enclosing Python
                    # scope (replacement scan) — do NOT rename this local.
                    sql_insert_into = (
                        f"INSERT INTO variants ({columns}) SELECT {columns} FROM chunk"
                    )
                    self.conn.execute(sql_insert_into)
                elif connexion_format in ["sqlite"]:
                    # sqlite path: pandas handles the INSERT directly
                    chunk.to_sql("variants", self.conn, if_exists="append", index=False)
The function reads a file in chunks and inserts each chunk into a table based on the specified database format.
Parameters
- file: The
fileparameter is the file that you want to load into a table. It should be the path to the file on your system - columns: The
columnsparameter in theinsert_file_to_tablefunction is a string that should contain the names of the columns in the table where the data will be inserted. The column names should be separated by commas within the string. For example, if you have columns named "id", "name - header_len: The
header_lenparameter in theinsert_file_to_tablefunction specifies the number of lines to skip at the beginning of the file before reading the actual data. This parameter allows you to skip any header information present in the file before processing the data, defaults to 0 - sep: The
sepparameter in theinsert_file_to_tablefunction is used to specify the separator character that is used in the file being read. In this case, the default separator is set to, which represents a tab character. You can change this parameter to a different separator character if, defaults to - chunksize: The
chunksizeparameter specifies the number of rows to read in at a time when processing the file in chunks. In the provided code snippet, the default value forchunksizeis set to 1000000. This means that the file will be read in chunks of 1,, defaults to 1000000
1300 def load_data( 1301 self, 1302 input_file: str = None, 1303 drop_variants_table: bool = False, 1304 sample_size: int = 20480, 1305 ) -> None: 1306 """ 1307 The `load_data` function reads a VCF file and inserts it into a table, with options to drop the 1308 table before loading the data and specify a sample size. 1309 1310 :param input_file: The path to the input file. This is the VCF file that will be loaded into the 1311 table 1312 :type input_file: str 1313 :param drop_variants_table: The `drop_variants_table` parameter is a boolean flag that 1314 determines whether the variants table should be dropped before loading the data. If set to 1315 `True`, the variants table will be dropped. If set to `False` (default), the variants table will 1316 not be dropped, defaults to False 1317 :type drop_variants_table: bool (optional) 1318 :param sample_size: The `sample_size` parameter determines the number of rows to be sampled from 1319 the input file. If it is set to `None`, the default value of 20480 will be used, defaults to 1320 20480 1321 :type sample_size: int (optional) 1322 """ 1323 1324 log.info("Loading...") 1325 1326 # change input file 1327 if input_file: 1328 self.set_input(input_file) 1329 self.set_header() 1330 1331 # drop variants table 1332 if drop_variants_table: 1333 self.drop_variants_table() 1334 1335 # get table variants 1336 table_variants = self.get_table_variants() 1337 1338 # Access 1339 access = self.get_config().get("access", None) 1340 log.debug(f"access: {access}") 1341 1342 # Input format and compress 1343 input_format = self.get_input_format() 1344 input_compressed = self.get_input_compressed() 1345 log.debug(f"input_format: {input_format}") 1346 log.debug(f"input_compressed: {input_compressed}") 1347 1348 # input_compressed_format 1349 if input_compressed: 1350 input_compressed_format = "gzip" 1351 else: 1352 input_compressed_format = "none" 1353 log.debug(f"input_compressed_format: {input_compressed_format}") 1354 1355 # Connexion 
format 1356 connexion_format = self.get_connexion_format() 1357 1358 # Sample size 1359 if not sample_size: 1360 sample_size = -1 1361 log.debug(f"sample_size: {sample_size}") 1362 1363 # Load data 1364 log.debug(f"Load Data from {input_format}") 1365 1366 # DuckDB connexion 1367 if connexion_format in ["duckdb"]: 1368 1369 # Database already exists 1370 if self.input_format in ["db", "duckdb"]: 1371 1372 if connexion_format in ["duckdb"]: 1373 log.debug(f"Input file format '{self.input_format}' duckDB") 1374 else: 1375 log.error( 1376 f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'" 1377 ) 1378 raise ValueError( 1379 f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'" 1380 ) 1381 1382 # Load from existing database format 1383 else: 1384 1385 try: 1386 # Create Table or View 1387 database = Database(database=self.input) 1388 sql_from = database.get_sql_from(sample_size=sample_size) 1389 1390 if access in ["RO"]: 1391 sql_load = ( 1392 f"CREATE VIEW {table_variants} AS SELECT * FROM {sql_from}" 1393 ) 1394 else: 1395 sql_load = ( 1396 f"CREATE TABLE {table_variants} AS SELECT * FROM {sql_from}" 1397 ) 1398 self.conn.execute(sql_load) 1399 1400 except: 1401 # Format not available 1402 log.error(f"Input file format '{self.input_format}' not available") 1403 raise ValueError( 1404 f"Input file format '{self.input_format}' not available" 1405 ) 1406 1407 # SQLite connexion 1408 elif connexion_format in ["sqlite"] and input_format in [ 1409 "vcf", 1410 "tsv", 1411 "csv", 1412 "psv", 1413 ]: 1414 1415 # Main structure 1416 structure = { 1417 "#CHROM": "VARCHAR", 1418 "POS": "INTEGER", 1419 "ID": "VARCHAR", 1420 "REF": "VARCHAR", 1421 "ALT": "VARCHAR", 1422 "QUAL": "VARCHAR", 1423 "FILTER": "VARCHAR", 1424 "INFO": "VARCHAR", 1425 } 1426 1427 # Strcuture with samples 1428 structure_complete = structure 1429 if self.get_header_sample_list(): 1430 structure["FORMAT"] = "VARCHAR" 
1431 for sample in self.get_header_sample_list(): 1432 structure_complete[sample] = "VARCHAR" 1433 1434 # Columns list for create and insert 1435 sql_create_table_columns = [] 1436 sql_create_table_columns_list = [] 1437 for column in structure_complete: 1438 column_type = structure_complete[column] 1439 sql_create_table_columns.append( 1440 f'"{column}" {column_type} default NULL' 1441 ) 1442 sql_create_table_columns_list.append(f'"{column}"') 1443 1444 # Create database 1445 log.debug(f"Create Table {table_variants}") 1446 sql_create_table_columns_sql = ", ".join(sql_create_table_columns) 1447 sql_create_table_columns_list_sql = ", ".join(sql_create_table_columns_list) 1448 sql_create_table = f"CREATE TABLE IF NOT EXISTS {table_variants} ({sql_create_table_columns_sql})" 1449 self.conn.execute(sql_create_table) 1450 1451 # chunksize define length of file chunk load file 1452 chunksize = 100000 1453 1454 # delimiter 1455 delimiter = file_format_delimiters.get(input_format, "\t") 1456 1457 # Load the input file 1458 with open(self.input, "rt") as input_file: 1459 1460 # Use the appropriate file handler based on the input format 1461 if input_compressed: 1462 input_file = bgzf.open(self.input, "rt") 1463 if input_format in ["vcf"]: 1464 header_len = self.get_header_length() 1465 else: 1466 header_len = 0 1467 1468 # Insert the file contents into a table 1469 self.insert_file_to_table( 1470 input_file, 1471 columns=sql_create_table_columns_list_sql, 1472 header_len=header_len, 1473 sep=delimiter, 1474 chunksize=chunksize, 1475 ) 1476 1477 else: 1478 log.error( 1479 f"Connexion format '{connexion_format}' not available with format '{input_format}'" 1480 ) 1481 raise ValueError( 1482 f"Connexion format '{connexion_format}' not available with format '{input_format}'" 1483 ) 1484 1485 # Explode INFOS fields into table fields 1486 if self.get_explode_infos(): 1487 self.explode_infos( 1488 prefix=self.get_explode_infos_prefix(), 1489 fields=self.get_explode_infos_fields(), 
1490 force=True, 1491 ) 1492 1493 # Create index after insertion 1494 self.create_indexes()
The load_data function reads a VCF file and inserts it into a table, with options to drop the
table before loading the data and specify a sample size.
Parameters
- input_file: The path to the input file. This is the VCF file that will be loaded into the table
- drop_variants_table: The
`drop_variants_table` parameter is a boolean flag that determines whether the variants table should be dropped before loading the data. If set to `True`, the variants table will be dropped. If set to `False` (default), the variants table will not be dropped. - sample_size: The
sample_sizeparameter determines the number of rows to be sampled from the input file. If it is set toNone, the default value of 20480 will be used, defaults to 20480
1496 def get_explode_infos(self) -> bool: 1497 """ 1498 The function `get_explode_infos` returns the value of the "explode_infos" parameter, defaulting 1499 to False if it is not set. 1500 :return: The method is returning the value of the "explode_infos" parameter, which is a boolean 1501 value. If the parameter is not present, it will return False. 1502 """ 1503 1504 return self.get_param().get("explode", {}).get("explode_infos", False)
The function get_explode_infos returns the value of the "explode_infos" parameter, defaulting
to False if it is not set.
Returns
The method is returning the value of the "explode_infos" parameter, which is a boolean value. If the parameter is not present, it will return False.
    def get_explode_infos_fields(
        self,
        explode_infos_fields: str = None,
        remove_fields_not_in_header: bool = False,
    ) -> list:
        """Return the resolved list of INFO fields to explode into columns.

        Entries are treated as regular expressions matched against the
        header's INFO fields; the keyword "*" expands to every header field.

        :param explode_infos_fields: fields to explode, as a comma-separated
            string or a list; when omitted, the value is taken from the
            "explode"/"explode_infos_fields" parameter and finally defaults
            to "*" (all header fields)
        :type explode_infos_fields: str
        :param remove_fields_not_in_header: when True, drop any resolved
            field that is not present in the header, defaults to False
        :type remove_fields_not_in_header: bool (optional)
        :return: the resolved field names, in input order with pattern
            matches expanded and duplicates removed
        """

        # If no fields, get it in param
        if not explode_infos_fields:
            explode_infos_fields = (
                self.get_param().get("explode", {}).get("explode_infos_fields", None)
            )

        # If no fields, defined as all fields in header using keyword
        if not explode_infos_fields:
            explode_infos_fields = "*"

        # If fields list not empty
        if explode_infos_fields:

            # Input fields list (accept a comma-separated string or a list)
            if isinstance(explode_infos_fields, str):
                fields_input = explode_infos_fields.split(",")
            elif isinstance(explode_infos_fields, list):
                fields_input = explode_infos_fields
            else:
                fields_input = []

            # Fields list without * keyword
            # NOTE(review): fields_without_all is computed but never used below
            fields_without_all = fields_input.copy()
            if "*".casefold() in (item.casefold() for item in fields_without_all):
                fields_without_all.remove("*")

            # Fields in header (sorted, deduplicated INFO field names)
            fields_in_header = sorted(list(set(self.get_header().infos)))

            # Construct list of fields
            fields_output = []
            for field in fields_input:

                # Strip field
                field = field.strip()

                # format keyword * in regex
                if field.upper() in ["*"]:
                    field = ".*"

                # Find all fields with pattern (regex match against header)
                r = re.compile(field)
                fields_search = sorted(list(filter(r.match, fields_in_header)))

                # Remove fields input from search: an exact header match wins;
                # otherwise pattern matches that are themselves explicit inputs
                # are left to their own entry in fields_input
                if field in fields_search:
                    fields_search = [field]
                elif fields_search != [field]:
                    fields_search = sorted(
                        list(set(fields_search).difference(fields_input))
                    )

                # If field is not in header (avoid not well formatted header)
                if not fields_search and not remove_fields_not_in_header:
                    fields_search = [field]

                # Add found fields
                for new_field in fields_search:
                    # Add field, if not already exists, and if it is in header (if asked)
                    if (
                        new_field not in fields_output
                        and (
                            not remove_fields_not_in_header
                            or new_field in fields_in_header
                        )
                        and new_field not in [".*"]
                    ):
                        fields_output.append(new_field)

            return fields_output

        else:

            return []
The get_explode_infos_fields function returns a list of exploded information fields based on
the input parameter explode_infos_fields.
Parameters
- explode_infos_fields: The
`explode_infos_fields` parameter is a string that specifies the fields to be exploded. It can be set to "ALL" to explode all fields, or it can be a comma-separated list of field names to explode. - remove_fields_not_in_header: The parameter
remove_fields_not_in_headeris a boolean flag that determines whether to remove fields that are not present in the header. If it is set toTrue, any field that is not in the header will be excluded from the list of exploded information fields. If it is set to `, defaults to False
Returns
The function
`get_explode_infos_fields` returns a list of exploded information fields. If the `explode_infos_fields` parameter is not provided or is set to None, it returns an empty list. If the parameter is provided and its value is "ALL", it also returns an empty list. Otherwise, it returns a list of exploded information fields after removing any spaces and splitting the string by commas.
1606 def get_explode_infos_prefix(self, explode_infos_prefix: str = None) -> str: 1607 """ 1608 The function `get_explode_infos_prefix` returns the value of the `explode_infos_prefix` parameter, or 1609 the value of `self.get_param().get("explode_infos_prefix", None)` if `explode_infos_prefix` is 1610 not provided. 1611 1612 :param explode_infos_prefix: The parameter `explode_infos_prefix` is a string that specifies a 1613 prefix to be used for exploding or expanding information 1614 :type explode_infos_prefix: str 1615 :return: the value of the variable `explode_infos_prefix`. 1616 """ 1617 1618 if not explode_infos_prefix: 1619 explode_infos_prefix = ( 1620 self.get_param().get("explode", {}).get("explode_infos_prefix", "") 1621 ) 1622 1623 return explode_infos_prefix
The function get_explode_infos_prefix returns the value of the explode_infos_prefix parameter, or
the value of self.get_param().get("explode_infos_prefix", None) if explode_infos_prefix is
not provided.
Parameters
- `explode_infos_prefix`: a string that specifies the prefix to be used for exploded INFO fields.
Returns
The resolved value of `explode_infos_prefix` (falling back to the `explode.explode_infos_prefix` parameter, or an empty string).
1625 def add_column( 1626 self, 1627 table_name, 1628 column_name, 1629 column_type, 1630 default_value=None, 1631 drop: bool = False, 1632 ) -> dict: 1633 """ 1634 The `add_column` function adds a column to a SQLite or DuckDB table with a default value if it 1635 doesn't already exist. 1636 1637 :param table_name: The name of the table to which you want to add a column 1638 :param column_name: The parameter "column_name" is the name of the column that you want to add 1639 to the table 1640 :param column_type: The `column_type` parameter specifies the data type of the column that you 1641 want to add to the table. It should be a string that represents the desired data type, such as 1642 "INTEGER", "TEXT", "REAL", etc 1643 :param default_value: The `default_value` parameter is an optional parameter that specifies the 1644 default value for the newly added column. If a default value is provided, it will be assigned to 1645 the column for any existing rows that do not have a value for that column 1646 :param drop: The `drop` parameter is a boolean flag that determines whether to drop the column 1647 if it already exists in the table. If `drop` is set to `True`, the function will drop the 1648 existing column before adding the new column. If `drop` is set to `False` (default),, defaults 1649 to False 1650 :type drop: bool (optional) 1651 :return: a boolean value indicating whether the column was successfully added to the table. 
1652 """ 1653 1654 # added 1655 added = False 1656 dropped = False 1657 1658 # Check if the column already exists in the table 1659 query = f""" SELECT * FROM {table_name} LIMIT 0 """ 1660 columns = self.get_query_to_df(query).columns.tolist() 1661 if column_name.upper() in [c.upper() for c in columns]: 1662 log.debug( 1663 f"The {column_name} column already exists in the {table_name} table" 1664 ) 1665 if drop: 1666 self.drop_column(table_name=table_name, column_name=column_name) 1667 dropped = True 1668 else: 1669 return None 1670 else: 1671 log.debug(f"The {column_name} column NOT exists in the {table_name} table") 1672 1673 # Add column in table 1674 add_column_query = ( 1675 f""" ALTER TABLE {table_name} ADD COLUMN "{column_name}" {column_type} """ 1676 ) 1677 if default_value is not None: 1678 add_column_query += f" DEFAULT {default_value}" 1679 self.execute_query(add_column_query) 1680 added = not dropped 1681 log.debug( 1682 f"The {column_name} column was successfully added to the {table_name} table" 1683 ) 1684 1685 if added: 1686 added_column = { 1687 "table_name": table_name, 1688 "column_name": column_name, 1689 "column_type": column_type, 1690 "default_value": default_value, 1691 } 1692 else: 1693 added_column = None 1694 1695 return added_column
The add_column function adds a column to a SQLite or DuckDB table with a default value if it
doesn't already exist.
Parameters
- table_name: The name of the table to which you want to add a column
- column_name: The parameter "column_name" is the name of the column that you want to add to the table
- `column_type`: the SQL data type of the column to add, e.g. "INTEGER", "TEXT", "REAL".
- `default_value`: optional default value for the newly added column; when provided, it is assigned to existing rows that have no value for that column.
- `drop`: boolean flag; when `True`, an already-existing column is dropped and re-created, when `False` (the default) an existing column is left unchanged.
Returns
a dictionary describing the added column (table name, column name, type, default value), or `None` when the column already existed.
1697 def drop_column( 1698 self, column: dict = None, table_name: str = None, column_name: str = None 1699 ) -> bool: 1700 """ 1701 The `drop_column` function drops a specified column from a given table in a database and returns 1702 True if the column was successfully dropped, and False if the column does not exist in the 1703 table. 1704 1705 :param column: The `column` parameter is a dictionary that contains information about the column 1706 you want to drop. It has two keys: 1707 :type column: dict 1708 :param table_name: The `table_name` parameter is the name of the table from which you want to 1709 drop a column 1710 :type table_name: str 1711 :param column_name: The `column_name` parameter is the name of the column that you want to drop 1712 from the table 1713 :type column_name: str 1714 :return: a boolean value. It returns True if the column was successfully dropped from the table, 1715 and False if the column does not exist in the table. 1716 """ 1717 1718 # Find column infos 1719 if column: 1720 if isinstance(column, dict): 1721 table_name = column.get("table_name", None) 1722 column_name = column.get("column_name", None) 1723 elif isinstance(column, str): 1724 table_name = self.get_table_variants() 1725 column_name = column 1726 else: 1727 table_name = None 1728 column_name = None 1729 1730 if not table_name and not column_name: 1731 return False 1732 1733 # Removed 1734 removed = False 1735 1736 # Check if the column already exists in the table 1737 query = f""" SELECT * FROM {table_name} LIMIT 0 """ 1738 columns = self.get_query_to_df(query).columns.tolist() 1739 if column_name in columns: 1740 log.debug(f"The {column_name} column exists in the {table_name} table") 1741 else: 1742 log.debug(f"The {column_name} column NOT exists in the {table_name} table") 1743 return False 1744 1745 # Add column in table # ALTER TABLE integers DROP k 1746 add_column_query = f""" ALTER TABLE {table_name} DROP "{column_name}" """ 1747 
self.execute_query(add_column_query) 1748 removed = True 1749 log.debug( 1750 f"The {column_name} column was successfully dropped to the {table_name} table" 1751 ) 1752 1753 return removed
The drop_column function drops a specified column from a given table in a database and returns
True if the column was successfully dropped, and False if the column does not exist in the
table.
Parameters
- `column`: a dictionary describing the column to drop, with two keys: "table_name" and "column_name" (a plain string column name is also accepted).
- `table_name`: the name of the table from which to drop the column.
- `column_name`: the name of the column to drop from the table.
Returns
a boolean value. It returns True if the column was successfully dropped from the table, and False if the column does not exist in the table.
    def explode_infos(
        self,
        prefix: str = None,
        create_index: bool = False,
        fields: list = None,
        force: bool = False,
        proccess_all_fields_together: bool = False,
        table: str = None,
    ) -> list:
        """
        Explode the VCF INFO column into individual columns of the variants
        table, one per INFO field, and return the list of added columns.

        :param prefix: Prefix for the exploded INFO columns; when not a
            string, falls back to `self.get_explode_infos_prefix()` and then
            to "INFO/".
        :type prefix: str
        :param create_index: When True, (re)create indexes after exploding.
        :type create_index: bool (optional)
        :param fields: INFO fields (or patterns) to explode into individual
            columns; resolved through `get_explode_infos_fields`.
        :type fields: list
        :param force: When True, drop and re-create a column that already
            exists.
        :type force: bool (optional)
        :param proccess_all_fields_together: When True, update all exploded
            fields in a single UPDATE statement instead of one per field.
        :type proccess_all_fields_together: bool (optional)
        :param table: Target table; defaults to the variants table.
        :type table: str
        :return: The list of added columns (dicts as returned by `add_column`).
        """

        # drop indexes (they would slow down / block the UPDATEs below)
        self.drop_indexes()

        # connexion format
        connexion_format = self.get_connexion_format()

        # Access
        access = self.get_config().get("access", None)

        # Added columns
        added_columns = []

        # Nothing is modified on a read-only connexion
        if access not in ["RO"]:

            # prefix: explicit string wins, then configured prefix, then "INFO/"
            if prefix in [None, True] or not isinstance(prefix, str):
                if self.get_explode_infos_prefix() not in [None, True]:
                    prefix = self.get_explode_infos_prefix()
                else:
                    prefix = "INFO/"

            # table variants
            if table is not None:
                table_variants = table
            else:
                table_variants = self.get_table_variants(clause="select")

            # extra infos (columns present in the table but not in the header);
            # best-effort: on failure, treated as empty
            try:
                extra_infos = self.get_extra_infos()
            except:
                extra_infos = []

            # Header infos
            header_infos = self.get_header().infos

            log.debug(
                f"Explode INFO fields - ADD [{len(header_infos)}] annotations fields"
            )

            sql_info_alter_table_array = []

            # Info fields to check (header fields plus explicitly requested ones)
            fields_list = list(header_infos)
            if fields:
                fields_list += fields
            fields_list = set(fields_list)

            # If no fields
            if not fields:
                fields = []

            # Translate fields if patterns (regex / "*" expansion)
            fields = self.get_explode_infos_fields(explode_infos_fields=fields)

            for info in fields:

                # Name of the exploded column
                info_id_sql = prefix + info

                # Only explode fields known from the header, the request, or
                # the extra (non-header) table columns
                if (
                    info in fields_list
                    or prefix + info in fields_list
                    or info in extra_infos
                ):

                    log.debug(f"Explode INFO fields - ADD '{info}' annotations fields")

                    # Fields absent from the header fall back to String type
                    if info in header_infos:
                        info_type = header_infos[info].type
                        info_num = header_infos[info].num
                    else:
                        info_type = "String"
                        info_num = 0

                    # Multi-valued fields are stored as VARCHAR
                    type_sql = self.code_type_map_to_sql.get(info_type, "VARCHAR")
                    if info_num != 1:
                        type_sql = "VARCHAR"

                    # Add field
                    added_column = self.add_column(
                        table_name=table_variants,
                        column_name=info_id_sql,
                        column_type=type_sql,
                        default_value="null",
                        drop=force,
                    )

                    if added_column:
                        added_columns.append(added_column)

                    if added_column or force:

                        # add field to index
                        self.index_additionnal_fields.append(info_id_sql)

                        # Update field array: extract '<info>=<value>' from the
                        # INFO column ('' and '.' are normalized to NULL)
                        # NOTE(review): for a connexion format other than
                        # duckdb/sqlite, update_info_field would be unbound
                        # here — presumably only these two formats occur;
                        # confirm upstream.
                        if connexion_format in ["duckdb"]:
                            update_info_field = f"""
                            "{info_id_sql}" =
                                CASE
                                    WHEN REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1) IN ('','.') THEN NULL
                                    ELSE REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1)
                                END
                            """
                        elif connexion_format in ["sqlite"]:
                            update_info_field = f"""
                            "{info_id_sql}" =
                                CASE
                                    WHEN instr(INFO, '{info}=') = 0 THEN NULL
                                    WHEN instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';') = 0 THEN substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1)
                                    ELSE substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1, instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';')-instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')-1)
                                END
                            """

                        sql_info_alter_table_array.append(update_info_field)

            if sql_info_alter_table_array:

                # By chromosomes (chunked UPDATEs); best-effort: on failure,
                # a single pass without WHERE clause is done
                try:
                    chromosomes_list = list(
                        self.get_query_to_df(
                            f""" SELECT "#CHROM" FROM {table_variants} GROUP BY "#CHROM" """
                        )["#CHROM"]
                    )
                except:
                    chromosomes_list = [None]

                for chrom in chromosomes_list:
                    log.debug(f"Explode INFO fields - Chromosome {chrom}...")

                    # Where clause (only useful with more than one chromosome)
                    where_clause = ""
                    if chrom and len(chromosomes_list) > 1:
                        where_clause = f""" WHERE "#CHROM" = '{chrom}' """

                    # Update table
                    if proccess_all_fields_together:
                        # One UPDATE setting every exploded field at once
                        sql_info_alter_table_array_join = ", ".join(
                            sql_info_alter_table_array
                        )
                        if sql_info_alter_table_array_join:
                            sql_info_alter_table = f"""
                            UPDATE {table_variants}
                            SET {sql_info_alter_table_array_join}
                            {where_clause}
                            """
                            log.debug(
                                f"Explode INFO fields - Explode all {len(sql_info_alter_table_array)} fields..."
                            )
                            # log.debug(sql_info_alter_table)
                            self.conn.execute(sql_info_alter_table)
                    else:
                        # One UPDATE per exploded field
                        sql_info_alter_num = 0
                        for sql_info_alter in sql_info_alter_table_array:
                            sql_info_alter_num += 1
                            sql_info_alter_table = f"""
                            UPDATE {table_variants}
                            SET {sql_info_alter}
                            {where_clause}
                            """
                            log.debug(
                                f"Explode INFO fields - Explode field {sql_info_alter_num}/{len(sql_info_alter_table_array)}..."
                            )
                            # log.debug(sql_info_alter_table)
                            self.conn.execute(sql_info_alter_table)

            # create indexes
            if create_index:
                self.create_indexes()

        return added_columns
The explode_infos function in Python takes a VCF file and explodes the INFO fields into
individual columns, returning a list of added columns.
Parameters
- `prefix`: string used as a prefix for the exploded INFO fields; when not provided (or not a string), the value of `self.get_explode_infos_prefix()` is used, falling back to "INFO/".
- `create_index`: boolean flag; when `True`, indexes are created on the exploded INFO fields. Defaults to `False`.
- `fields`: list of INFO fields to explode into individual columns; when not provided, all INFO fields are considered.
- `force`: boolean flag; when `True`, a column that already exists is dropped and re-created. Defaults to `False`.
- `proccess_all_fields_together`: boolean flag; when `True`, all INFO fields are processed in a single UPDATE statement, otherwise each field is processed individually. Defaults to `False`.
- `table`: name of the table where the exploded INFO fields are added as individual columns; defaults to the variants table.
Returns
The `explode_infos` function returns a list of added columns.
1972 def create_indexes(self) -> None: 1973 """ 1974 Create indexes on the table after insertion 1975 """ 1976 1977 # Access 1978 access = self.get_config().get("access", None) 1979 1980 # get table variants 1981 table_variants = self.get_table_variants("FROM") 1982 1983 if self.get_indexing() and access not in ["RO"]: 1984 # Create index 1985 sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()} ON {table_variants} ("#CHROM", "POS", "REF", "ALT")' 1986 self.conn.execute(sql_create_table_index) 1987 sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_chrom ON {table_variants} ("#CHROM")' 1988 self.conn.execute(sql_create_table_index) 1989 sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_pos ON {table_variants} ("POS")' 1990 self.conn.execute(sql_create_table_index) 1991 sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_ref ON {table_variants} ( "REF")' 1992 self.conn.execute(sql_create_table_index) 1993 sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_alt ON {table_variants} ("ALT")' 1994 self.conn.execute(sql_create_table_index) 1995 for field in self.index_additionnal_fields: 1996 sql_create_table_index = f""" CREATE INDEX IF NOT EXISTS "idx_{self.get_table_variants()}_{field}" ON {table_variants} ("{field}") """ 1997 self.conn.execute(sql_create_table_index)
Create indexes on the table after insertion
1999 def drop_indexes(self) -> None: 2000 """ 2001 Create indexes on the table after insertion 2002 """ 2003 2004 # Access 2005 access = self.get_config().get("access", None) 2006 2007 # get table variants 2008 table_variants = self.get_table_variants("FROM") 2009 2010 # Get database format 2011 connexion_format = self.get_connexion_format() 2012 2013 if access not in ["RO"]: 2014 if connexion_format in ["duckdb"]: 2015 sql_list_indexes = f"SELECT index_name FROM duckdb_indexes WHERE table_name='{table_variants}'" 2016 elif connexion_format in ["sqlite"]: 2017 sql_list_indexes = f"SELECT name FROM sqlite_master WHERE type='index' AND tbl_name='{table_variants}';" 2018 2019 list_indexes = self.conn.execute(sql_list_indexes) 2020 index_names = [row[0] for row in list_indexes.fetchall()] 2021 for index in index_names: 2022 sql_drop_table_index = f""" DROP INDEX IF EXISTS "{index}" """ 2023 self.conn.execute(sql_drop_table_index)
Drop all existing indexes on the variants table.
2025 def read_vcf_header(self, f) -> list: 2026 """ 2027 It reads the header of a VCF file and returns a list of the header lines 2028 2029 :param f: the file object 2030 :return: The header lines of the VCF file. 2031 """ 2032 2033 header_list = [] 2034 for line in f: 2035 header_list.append(line) 2036 if line.startswith("#CHROM"): 2037 break 2038 return header_list
It reads the header of a VCF file and returns a list of the header lines
Parameters
- f: the file object
Returns
The header lines of the VCF file.
2040 def read_vcf_header_file(self, file: str = None) -> list: 2041 """ 2042 The `read_vcf_header_file` function reads the header of a VCF file, handling both compressed and 2043 uncompressed files. 2044 2045 :param file: The `file` parameter is a string that represents the path to the VCF header file 2046 that you want to read. It is an optional parameter, so if you don't provide a value, it will 2047 default to `None` 2048 :type file: str 2049 :return: The function `read_vcf_header_file` returns a list. 2050 """ 2051 2052 if self.get_input_compressed(input_file=file): 2053 with bgzf.open(file, "rt") as f: 2054 return self.read_vcf_header(f=f) 2055 else: 2056 with open(file, "rt") as f: 2057 return self.read_vcf_header(f=f)
The read_vcf_header_file function reads the header of a VCF file, handling both compressed and
uncompressed files.
Parameters
- `file`: a string path to the VCF header file to read; optional, defaults to `None`.
Returns
The `read_vcf_header_file` function returns a list of header lines.
2059 def execute_query(self, query: str): 2060 """ 2061 It takes a query as an argument, executes it, and returns the results 2062 2063 :param query: The query to be executed 2064 :return: The result of the query is being returned. 2065 """ 2066 if query: 2067 return self.conn.execute(query) # .fetchall() 2068 else: 2069 return None
It takes a query as an argument, executes it, and returns the results
Parameters
- query: The query to be executed
Returns
The result of the query is being returned.
    def export_output(
        self,
        output_file: str | None = None,
        output_header: str | None = None,
        export_header: bool = True,
        query: str | None = None,
        parquet_partitions: list | None = None,
        chunk_size: int | None = None,
        threads: int | None = None,
        sort: bool = False,
        index: bool = False,
        order_by: str | None = None,
    ) -> bool:
        """
        Export the loaded variants to an output file (VCF, CSV, TSV, PSV,
        Parquet, ... depending on the output file extension).

        :param output_file: Path of the output file; defaults to the object's
            configured output.
        :param output_header: Path of the exported header file; defaults to
            `<output_file>.hdr`.
        :param export_header: When True, export the header to a separate
            file (forced off for VCF output, where the header is inline).
        :param query: Optional SQL query used to select/filter the exported
            data.
        :param parquet_partitions: Columns used to partition Parquet output
            (a comma-separated string is split into a list).
        :param chunk_size: Number of records per batch for Parquet export;
            defaults to the "chunk_size" config value.
        :param threads: Number of threads for the export; defaults to the
            object's configured thread count.
        :param sort: When True, sort the output.
        :param index: When True, create an index on the output file.
        :param order_by: Column(s) used to order the output.
        :return: True when the output file exists after export, None
            otherwise.
        """

        # Log
        log.info("Exporting...")

        # Full path
        output_file = full_path(output_file)
        output_header = full_path(output_header)

        # Config
        config = self.get_config()

        # Param
        param = self.get_param()

        # Tmp files to remove
        tmp_to_remove = []

        # If no output, get it
        if not output_file:
            output_file = self.get_output()

        # If not threads
        if not threads:
            threads = self.get_threads()

        # Auto header name with extension
        if export_header or output_header:
            if not output_header:
                output_header = f"{output_file}.hdr"
            # Export header
            self.export_header(output_file=output_file)

        # Switch off export header if VCF output (header is inline; the
        # separate .hdr file becomes temporary)
        output_file_type = get_file_format(output_file)
        if output_file_type in ["vcf"]:
            export_header = False
            tmp_to_remove.append(output_header)

        # Chunk size
        if not chunk_size:
            chunk_size = config.get("chunk_size", None)

        # Parquet partition (comma-separated string is split into a list)
        if not parquet_partitions:
            parquet_partitions = param.get("export", {}).get("parquet_partitions", None)
        if parquet_partitions and isinstance(parquet_partitions, str):
            parquet_partitions = parquet_partitions.split(",")

        # Order by
        if not order_by:
            order_by = param.get("export", {}).get("order_by", "")

        # Header in output
        header_in_output = param.get("export", {}).get("include_header", False)

        # Database (source defaults to the live connexion)
        database_source = self.get_connexion()

        # Connexion format
        connexion_format = self.get_connexion_format()

        # Explode infos
        if self.get_explode_infos():
            self.explode_infos(
                prefix=self.get_explode_infos_prefix(),
                fields=self.get_explode_infos_fields(),
                force=False,
            )

        # if connexion_format in ["sqlite"] or query:
        if connexion_format in ["sqlite"]:

            # Export in Parquet: sqlite connexions are first dumped to a
            # temporary Parquet file which becomes the export source
            random_tmp = "".join(
                random.choice(string.ascii_lowercase) for i in range(10)
            )
            database_source = f"""{output_file}.{random_tmp}.database_export.parquet"""
            tmp_to_remove.append(database_source)

            # Table Variants
            table_variants = self.get_table_variants()

            # Create export query
            sql_query_export_subquery = f"""
                SELECT * FROM {table_variants}
                """

            # Write source file
            fp.write(database_source, self.get_query_to_df(sql_query_export_subquery))

        # Create database
        database = Database(
            database=database_source,
            table="variants",
            header_file=output_header,
            conn_config=self.get_connexion_config(),
        )

        # Existing colomns header
        existing_columns_header = database.get_header_columns_from_database()

        # Sample list (forced when samples were explicitly provided)
        get_samples = self.get_samples()
        get_samples_check = self.get_samples_check()
        samples_force = get_samples is not None
        sample_list = self.get_header_sample_list(
            check=get_samples_check, samples=get_samples, samples_force=samples_force
        )

        # Export file
        database.export(
            output_database=output_file,
            output_header=output_header,
            existing_columns_header=existing_columns_header,
            parquet_partitions=parquet_partitions,
            chunk_size=chunk_size,
            threads=threads,
            sort=sort,
            index=index,
            header_in_output=header_in_output,
            order_by=order_by,
            query=query,
            export_header=export_header,
            sample_list=sample_list,
        )

        # Remove
        remove_if_exists(tmp_to_remove)

        # NOTE(review): both operands of `and` are identical, so this is
        # equivalent to `os.path.exists(output_file) or None`
        return (os.path.exists(output_file) or None) and (
            os.path.exists(output_file) or None
        )
The export_output function exports data from a VCF file to a specified output file in various
formats, including VCF, CSV, TSV, PSV, and Parquet.
Parameters
- `output_file`: string path of the output file to generate; where the exported data is saved.
- `output_header`: string path of the file where the VCF header is exported; when not provided, the header is written next to `output_file` with the extension ".hdr".
- `export_header`: boolean flag; when `True`, the VCF header is exported to a separate file. Defaults to `True` (ignored when the output format is VCF).
- `query`: optional SQL query used to filter and select the data to export; when provided, only matching data is exported.
- `parquet_partitions`: list of columns used to partition the Parquet output; partitioning organizes data in a hierarchical directory structure based on column values, which can improve query performance on large datasets.
- `chunk_size`: number of records per batch when exporting in Parquet format; used to split the Parquet output into multiple files.
- `threads`: optional number of threads to use during the export; defaults to the object's configured thread count.
- `sort`: boolean flag; when `True`, the output is sorted on the genomic coordinates of the variants. Defaults to `False`.
- `index`: boolean flag; when `True`, an index is created on the output file. Defaults to `False`.
- `order_by`: string listing the column(s) used to sort the output; only applicable to VCF output.
Returns
a boolean value. It checks if the output file exists and returns True if it does, or None if it doesn't.
2267 def get_extra_infos(self, table: str = None) -> list: 2268 """ 2269 The `get_extra_infos` function returns a list of columns that are in a specified table but not 2270 in the header. 2271 2272 :param table: The `table` parameter in the `get_extra_infos` function is used to specify the 2273 name of the table from which you want to retrieve the extra columns that are not present in the 2274 header. If the `table` parameter is not provided when calling the function, it will default to 2275 using the variants 2276 :type table: str 2277 :return: A list of columns that are in the specified table but not in the header of the table. 2278 """ 2279 2280 header_columns = [] 2281 2282 if not table: 2283 table = self.get_table_variants(clause="from") 2284 header_columns = self.get_header_columns() 2285 2286 # Check all columns in the database 2287 query = f""" SELECT * FROM {table} LIMIT 1 """ 2288 log.debug(f"query {query}") 2289 table_columns = self.get_query_to_df(query).columns.tolist() 2290 extra_columns = [] 2291 2292 # Construct extra infos (not in header) 2293 for column in table_columns: 2294 if column not in header_columns: 2295 extra_columns.append(column) 2296 2297 return extra_columns
The get_extra_infos function returns a list of columns that are in a specified table but not
in the header.
Parameters
- table: The
`table` parameter in the `get_extra_infos` function is used to specify the name of the table from which you want to retrieve the extra columns that are not present in the header. If the `table` parameter is not provided when calling the function, it will default to using the variants table
Returns
A list of columns that are in the specified table but not in the header of the table.
2299 def get_extra_infos_sql(self, table: str = None) -> str: 2300 """ 2301 It returns a string of the extra infos, separated by commas, and each extra info is surrounded 2302 by double quotes 2303 2304 :param table: The name of the table to get the extra infos from. If None, the default table is 2305 used 2306 :type table: str 2307 :return: A string of the extra infos 2308 """ 2309 2310 return ", ".join( 2311 ['"' + str(elem) + '"' for elem in self.get_extra_infos(table=table)] 2312 )
It returns a string of the extra infos, separated by commas, and each extra info is surrounded by double quotes
Parameters
- table: The name of the table to get the extra infos from. If None, the default table is used
Returns
A string of the extra infos
2314 def export_header( 2315 self, 2316 header_name: str = None, 2317 output_file: str = None, 2318 output_file_ext: str = ".hdr", 2319 clean_header: bool = True, 2320 remove_chrom_line: bool = False, 2321 ) -> str: 2322 """ 2323 The `export_header` function takes a VCF file, extracts the header, modifies it according to 2324 specified options, and writes it to a new file. 2325 2326 :param header_name: The `header_name` parameter is the name of the header file to be created. If 2327 this parameter is not specified, the header will be written to the output file 2328 :type header_name: str 2329 :param output_file: The `output_file` parameter in the `export_header` function is used to 2330 specify the name of the output file where the header will be written. If this parameter is not 2331 provided, the header will be written to a temporary file 2332 :type output_file: str 2333 :param output_file_ext: The `output_file_ext` parameter in the `export_header` function is a 2334 string that represents the extension of the output header file. By default, it is set to ".hdr" 2335 if not specified by the user. This extension will be appended to the `output_file` name to 2336 create the final, defaults to .hdr 2337 :type output_file_ext: str (optional) 2338 :param clean_header: The `clean_header` parameter in the `export_header` function is a boolean 2339 flag that determines whether the header should be cleaned or not. When `clean_header` is set to 2340 `True`, the function will clean the header by modifying certain lines based on a specific 2341 pattern. If `clean_header`, defaults to True 2342 :type clean_header: bool (optional) 2343 :param remove_chrom_line: The `remove_chrom_line` parameter in the `export_header` function is a 2344 boolean flag that determines whether the #CHROM line should be removed from the header before 2345 writing it to the output file. 
If set to `True`, the #CHROM line will be removed; if set to `, 2346 defaults to False 2347 :type remove_chrom_line: bool (optional) 2348 :return: The function `export_header` returns the name of the temporary header file that is 2349 created. 2350 """ 2351 2352 if not header_name and not output_file: 2353 output_file = self.get_output() 2354 2355 if self.get_header(): 2356 2357 # Get header object 2358 header_obj = self.get_header() 2359 2360 # Create database 2361 db_for_header = Database(database=self.get_input()) 2362 2363 # Get real columns in the file 2364 db_header_columns = db_for_header.get_columns() 2365 2366 with tempfile.TemporaryDirectory() as tmpdir: 2367 2368 # Write header file 2369 header_file_tmp = os.path.join(tmpdir, "header") 2370 f = open(header_file_tmp, "w") 2371 vcf.Writer(f, header_obj) 2372 f.close() 2373 2374 # Replace #CHROM line with rel columns 2375 header_list = db_for_header.read_header_file( 2376 header_file=header_file_tmp 2377 ) 2378 header_list[-1] = "\t".join(db_header_columns) 2379 2380 # Remove CHROM line 2381 if remove_chrom_line: 2382 header_list.pop() 2383 2384 # Clean header 2385 if clean_header: 2386 header_list_clean = [] 2387 for head in header_list: 2388 # Clean head for malformed header 2389 head_clean = head 2390 head_clean = re.subn( 2391 "##FORMAT=<ID=(.*),Number=(.*),Type=Flag", 2392 r"##FORMAT=<ID=\1,Number=\2,Type=String", 2393 head_clean, 2394 2, 2395 )[0] 2396 # Write header 2397 header_list_clean.append(head_clean) 2398 header_list = header_list_clean 2399 2400 tmp_header_name = output_file + output_file_ext 2401 2402 f = open(tmp_header_name, "w") 2403 for line in header_list: 2404 f.write(line) 2405 f.close() 2406 2407 return tmp_header_name
The export_header function takes a VCF file, extracts the header, modifies it according to
specified options, and writes it to a new file.
Parameters
- header_name: The
`header_name` parameter is the name of the header file to be created. If this parameter is not specified, the header will be written to the output file - output_file: The
`output_file` parameter in the `export_header` function is used to specify the name of the output file where the header will be written. If this parameter is not provided, the header will be written to a temporary file - output_file_ext: The
`output_file_ext` parameter in the `export_header` function is a string that represents the extension of the output header file. By default, it is set to ".hdr" if not specified by the user. This extension will be appended to the `output_file` name to create the final file name, defaults to .hdr - clean_header: The
`clean_header` parameter in the `export_header` function is a boolean flag that determines whether the header should be cleaned or not. When `clean_header` is set to `True`, the function will clean the header by modifying certain lines based on a specific pattern, defaults to True - remove_chrom_line: The
`remove_chrom_line` parameter in the `export_header` function is a boolean flag that determines whether the #CHROM line should be removed from the header before writing it to the output file. If set to `True`, the #CHROM line will be removed; if set to `False`, it will be kept, defaults to False
Returns
The function
`export_header` returns the name of the temporary header file that is created.
2409 def export_variant_vcf( 2410 self, 2411 vcf_file, 2412 remove_info: bool = False, 2413 add_samples: bool = True, 2414 list_samples: list = [], 2415 where_clause: str = "", 2416 index: bool = False, 2417 threads: int | None = None, 2418 ) -> bool | None: 2419 """ 2420 The `export_variant_vcf` function exports a VCF file with specified samples, allowing options to 2421 remove INFO field, add samples, and control compression and indexing. 2422 2423 :param vcf_file: The `vcf_file` parameter is the name of the file where the VCF data will be 2424 written to. It is the output file that will contain the filtered VCF data based on the specified 2425 parameters 2426 :param remove_info: The `remove_info` parameter in the `export_variant_vcf` function is a 2427 boolean flag that determines whether to remove the INFO field from the output VCF file. If set 2428 to `True`, the INFO field will be removed. If set to `False`, the INFO field will be included 2429 in, defaults to False 2430 :type remove_info: bool (optional) 2431 :param add_samples: The `add_samples` parameter is a boolean parameter that determines whether 2432 the samples should be added to the VCF file or not. If set to True, the samples will be added. 2433 If set to False, the samples will be removed. The default value is True, defaults to True 2434 :type add_samples: bool (optional) 2435 :param list_samples: The `list_samples` parameter is a list of samples that you want to include 2436 in the output VCF file. By default, all samples will be included. If you provide a list of 2437 samples, only those samples will be included in the output file 2438 :type list_samples: list 2439 :param index: The `index` parameter in the `export_variant_vcf` function is a boolean flag that 2440 determines whether or not to create an index for the output VCF file. If `index` is set to 2441 `True`, the output VCF file will be indexed using tabix. 
If `index`, defaults to False 2442 :type index: bool (optional) 2443 :param threads: The `threads` parameter in the `export_variant_vcf` function specifies the 2444 number of threads to use for exporting the VCF file. It determines how many parallel threads 2445 will be used during the export process. More threads can potentially speed up the export process 2446 by utilizing multiple cores of the processor. If 2447 :type threads: int | None 2448 :return: The `export_variant_vcf` function returns the result of calling the `export_output` 2449 method with various parameters including the output file, query, threads, sort flag, and index 2450 flag. The `export_output` method is responsible for exporting the VCF data based on the 2451 specified parameters and configurations provided in the `export_variant_vcf` function. 2452 """ 2453 2454 # Config 2455 config = self.get_config() 2456 2457 # Extract VCF 2458 log.debug("Export VCF...") 2459 2460 # Table variants 2461 table_variants = self.get_table_variants() 2462 2463 # Threads 2464 if not threads: 2465 threads = self.get_threads() 2466 2467 # Info fields 2468 if remove_info: 2469 if not isinstance(remove_info, str): 2470 remove_info = "." 
2471 info_field = f"""'{remove_info}' as INFO""" 2472 else: 2473 info_field = "INFO" 2474 2475 # Samples fields 2476 if add_samples: 2477 if not list_samples: 2478 list_samples = self.get_header_sample_list() 2479 if list_samples: 2480 samples_fields = " , FORMAT , " + " , ".join(list_samples) 2481 else: 2482 samples_fields = "" 2483 log.debug(f"samples_fields: {samples_fields}") 2484 else: 2485 samples_fields = "" 2486 2487 # Where clause 2488 if where_clause is None: 2489 where_clause = "" 2490 2491 # Variants 2492 select_fields = """ "#CHROM", POS, ID, REF, ALT, QUAL, FILTER """ 2493 sql_query_select = f""" SELECT {select_fields}, {info_field} {samples_fields} FROM {table_variants} {where_clause} """ 2494 log.debug(f"sql_query_select={sql_query_select}") 2495 2496 return self.export_output( 2497 output_file=vcf_file, 2498 output_header=None, 2499 export_header=True, 2500 query=sql_query_select, 2501 parquet_partitions=None, 2502 chunk_size=config.get("chunk_size", None), 2503 threads=threads, 2504 sort=True, 2505 index=index, 2506 order_by=None, 2507 )
The export_variant_vcf function exports a VCF file with specified samples, allowing options to
remove INFO field, add samples, and control compression and indexing.
Parameters
- vcf_file: The
`vcf_file` parameter is the name of the file where the VCF data will be written to. It is the output file that will contain the filtered VCF data based on the specified parameters - remove_info: The
`remove_info` parameter in the `export_variant_vcf` function is a boolean flag that determines whether to remove the INFO field from the output VCF file. If set to `True`, the INFO field will be removed. If set to `False`, the INFO field will be included, defaults to False - add_samples: The
`add_samples` parameter is a boolean parameter that determines whether the samples should be added to the VCF file or not. If set to True, the samples will be added. If set to False, the samples will be removed. The default value is True, defaults to True - list_samples: The
`list_samples` parameter is a list of samples that you want to include in the output VCF file. By default, all samples will be included. If you provide a list of samples, only those samples will be included in the output file - index: The
`index` parameter in the `export_variant_vcf` function is a boolean flag that determines whether or not to create an index for the output VCF file. If `index` is set to `True`, the output VCF file will be indexed using tabix, defaults to False - threads: The
`threads` parameter in the `export_variant_vcf` function specifies the number of threads to use for exporting the VCF file. It determines how many parallel threads will be used during the export process. More threads can potentially speed up the export process by utilizing multiple cores of the processor. If not provided, a default thread count is used
Returns
The
`export_variant_vcf` function returns the result of calling the `export_output` method with various parameters including the output file, query, threads, sort flag, and index flag. The `export_output` method is responsible for exporting the VCF data based on the specified parameters and configurations provided in the `export_variant_vcf` function.
2509 def run_commands(self, commands: list = [], threads: int = 1) -> None: 2510 """ 2511 It takes a list of commands and runs them in parallel using the number of threads specified 2512 2513 :param commands: A list of commands to run 2514 :param threads: The number of threads to use, defaults to 1 (optional) 2515 """ 2516 2517 run_parallel_commands(commands, threads)
It takes a list of commands and runs them in parallel using the number of threads specified
Parameters
- commands: A list of commands to run
- threads: The number of threads to use, defaults to 1 (optional)
2519 def get_threads(self, default: int = 1) -> int: 2520 """ 2521 This function returns the number of threads to use for a job, with a default value of 1 if not 2522 specified. 2523 2524 :param default: The `default` parameter in the `get_threads` method is used to specify the 2525 default number of threads to use if no specific value is provided. If no value is provided for 2526 the `threads` parameter in the configuration or input parameters, the `default` value will be 2527 used, defaults to 1 2528 :type default: int (optional) 2529 :return: the number of threads to use for the current job. 2530 """ 2531 2532 # Config 2533 config = self.get_config() 2534 2535 # Param 2536 param = self.get_param() 2537 2538 # Input threads 2539 input_thread = param.get("threads", config.get("threads", None)) 2540 2541 # Check threads 2542 if not input_thread: 2543 threads = default 2544 elif int(input_thread) <= 0: 2545 threads = os.cpu_count() 2546 else: 2547 threads = int(input_thread) 2548 return threads
This function returns the number of threads to use for a job, with a default value of 1 if not specified.
Parameters
- default: The
`default` parameter in the `get_threads` method is used to specify the default number of threads to use if no specific value is provided. If no value is provided for the `threads` parameter in the configuration or input parameters, the `default` value will be used, defaults to 1
Returns
the number of threads to use for the current job.
2550 def get_memory(self, default: str = None) -> str: 2551 """ 2552 This function retrieves the memory value from parameters or configuration with a default value 2553 if not found. 2554 2555 :param default: The `get_memory` function takes in a default value as a string parameter. This 2556 default value is used as a fallback in case the `memory` parameter is not provided in the 2557 `param` dictionary or the `config` dictionary. If `memory` is not found in either dictionary, 2558 the function 2559 :type default: str 2560 :return: The `get_memory` function returns a string value representing the memory parameter. If 2561 the `input_memory` is provided in the parameters, it will return that value. Otherwise, it will 2562 return the default value provided as an argument to the function. 2563 """ 2564 2565 # Config 2566 config = self.get_config() 2567 2568 # Param 2569 param = self.get_param() 2570 2571 # Input threads 2572 input_memory = param.get("memory", config.get("memory", None)) 2573 2574 # Check threads 2575 if input_memory: 2576 memory = input_memory 2577 else: 2578 memory = default 2579 2580 return memory
This function retrieves the memory value from parameters or configuration with a default value if not found.
Parameters
- default: The
`get_memory` function takes in a default value as a string parameter. This default value is used as a fallback in case the `memory` parameter is not provided in the `param` dictionary or the `config` dictionary.
Returns
The
`get_memory` function returns a string value representing the memory parameter. If the `input_memory` is provided in the parameters, it will return that value. Otherwise, it will return the default value provided as an argument to the function.
2582 def update_from_vcf(self, vcf_file: str) -> None: 2583 """ 2584 > If the database is duckdb, then use the parquet method, otherwise use the sqlite method 2585 2586 :param vcf_file: the path to the VCF file 2587 """ 2588 2589 connexion_format = self.get_connexion_format() 2590 2591 if connexion_format in ["duckdb"]: 2592 self.update_from_vcf_duckdb(vcf_file) 2593 elif connexion_format in ["sqlite"]: 2594 self.update_from_vcf_sqlite(vcf_file)
If the database is duckdb, then use the parquet method, otherwise use the sqlite method
Parameters
- vcf_file: the path to the VCF file
    def update_from_vcf_duckdb(self, vcf_file: str) -> None:
        """
        Append the INFO column of a VCF file to the variants table (DuckDB).

        For every variant matched on #CHROM/POS/REF/ALT, the INFO value of the
        VCF file is concatenated to the existing INFO value with a ';'
        separator; values of '' or '.' are treated as empty on both sides.

        :param vcf_file: the path to the VCF file
        """

        # variants table
        table_variants = self.get_table_variants()

        # Load the VCF body into a DataFrame, skipping all header lines.
        # NOTE: vcf_df looks unused, but DuckDB's replacement scan resolves
        # the name "vcf_df" in the SQL below to this local DataFrame.
        skip = self.get_header_length(file=vcf_file)
        vcf_df = pd.read_csv(
            vcf_file,
            sep="\t",
            engine="c",
            skiprows=skip,
            header=0,
            low_memory=False,
        )
        # NOTE(review): the correlated subquery assumes at most one matching
        # row per variant in the VCF — confirm inputs are deduplicated.
        sql_query_update = f"""
            UPDATE {table_variants} as table_variants
            SET INFO = concat(
                CASE
                    WHEN INFO NOT IN ('', '.')
                    THEN INFO
                    ELSE ''
                END,
                (
                    SELECT
                        concat(
                            CASE
                                WHEN table_variants.INFO NOT IN ('','.') AND table_parquet.INFO NOT IN ('','.')
                                THEN ';'
                                ELSE ''
                            END
                            ,
                            CASE
                                WHEN table_parquet.INFO NOT IN ('','.')
                                THEN table_parquet.INFO
                                ELSE ''
                            END
                        )
                    FROM vcf_df as table_parquet
                    WHERE CAST(table_parquet.\"#CHROM\" AS VARCHAR) = CAST(table_variants.\"#CHROM\" AS VARCHAR)
                    AND table_parquet.\"POS\" = table_variants.\"POS\"
                    AND table_parquet.\"ALT\" = table_variants.\"ALT\"
                    AND table_parquet.\"REF\" = table_variants.\"REF\"
                    AND table_parquet.INFO NOT IN ('','.')
                )
            )
            ;
            """
        self.conn.execute(sql_query_update)
It takes a VCF file and updates the INFO column of the variants table in the database with the INFO column of the VCF file
Parameters
- vcf_file: the path to the VCF file
    def update_from_vcf_sqlite(self, vcf_file: str) -> None:
        """
        Append the INFO column of a VCF file to the variants table (SQLite).

        The VCF body is loaded into a temporary table, the variants table is
        updated by concatenating the matching INFO values (matched on
        #CHROM/POS/REF/ALT, with ';' as separator and '' / '.' treated as
        empty), and the temporary table is dropped.

        :param vcf_file: The path to the VCF file you want to update the database with
        """

        # Create a temporary table with the same schema as 'variants'
        # (WHERE 0 copies the structure without any rows)
        table_vcf = "tmp_vcf"
        sql_create = (
            f"CREATE TEMPORARY TABLE {table_vcf} AS SELECT * FROM variants WHERE 0"
        )
        self.conn.execute(sql_create)

        # Load the VCF body into the temporary table.
        # NOTE(review): this assumes exactly 8 columns (no FORMAT/sample
        # columns); a VCF with genotypes would fail the columns assignment —
        # confirm callers only pass annotation-style VCFs.
        vcf_df = pd.read_csv(
            vcf_file, sep="\t", comment="#", header=None, low_memory=False
        )
        vcf_df.columns = ["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"]
        vcf_df.to_sql(table_vcf, self.conn, if_exists="append", index=False)

        # Update table 'variants' with VCF data
        # warning: CONCAT as || operator
        sql_query_update = f"""
            UPDATE variants as table_variants
            SET INFO = CASE
                    WHEN INFO NOT IN ('', '.')
                    THEN INFO
                    ELSE ''
                END ||
                (
                    SELECT
                        CASE
                            WHEN table_variants.INFO NOT IN ('','.')
                            AND table_vcf.INFO NOT IN ('','.')
                            THEN ';'
                            ELSE ''
                        END ||
                        CASE
                            WHEN table_vcf.INFO NOT IN ('','.')
                            THEN table_vcf.INFO
                            ELSE ''
                        END
                    FROM {table_vcf} as table_vcf
                    WHERE table_vcf.\"#CHROM\" = table_variants.\"#CHROM\"
                    AND table_vcf.\"POS\" = table_variants.\"POS\"
                    AND table_vcf.\"ALT\" = table_variants.\"ALT\"
                    AND table_vcf.\"REF\" = table_variants.\"REF\"
                )
            """
        self.conn.execute(sql_query_update)

        # Drop temporary table
        sql_drop = f"DROP TABLE {table_vcf}"
        self.conn.execute(sql_drop)
It creates a temporary table in the SQLite database, loads the VCF file into the temporary table, then updates the INFO column of the variants table with the INFO column of the temporary table
Parameters
- vcf_file: The path to the VCF file you want to update the database with
2710 def drop_variants_table(self) -> None: 2711 """ 2712 > This function drops the variants table 2713 """ 2714 2715 table_variants = self.get_table_variants() 2716 sql_table_variants = f"DROP TABLE IF EXISTS {table_variants}" 2717 self.conn.execute(sql_table_variants)
This function drops the variants table
    def set_variant_id(
        self, variant_id_column: str = "variant_id", force: bool = None
    ) -> str:
        """
        Add a variant id column to the variants table and populate it with a
        hash of the assembly, #CHROM, POS, REF and ALT columns.

        :param variant_id_column: The name of the column to be created in the
            variants table, defaults to variant_id
        :type variant_id_column: str (optional)
        :param force: If True, the variant_id column will be created even if it
            already exists
        :type force: bool
        :return: The name of the column that contains the variant_id
        """

        # Assembly
        assembly = self.get_param().get(
            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
        )

        # INFO/Tag prefix
        prefix = self.get_explode_infos_prefix()

        # Explode INFO/SVTYPE into its own column (dropped again at the end)
        added_columns = self.explode_infos(prefix=prefix, fields=["SVTYPE"])

        # variants table
        table_variants = self.get_table_variants()

        # variant_id column
        if not variant_id_column:
            variant_id_column = "variant_id"

        # Create variant_id column.
        # NOTE(review): the existence check uses the literal name
        # "variant_id", not variant_id_column — a custom column name is
        # recreated on every call; confirm this is intended.
        if "variant_id" not in self.get_extra_infos() or force:

            # Create column
            self.add_column(
                table_name=table_variants,
                column_name=variant_id_column,
                column_type="UBIGINT",
                default_value="0",
            )

            # Update column.
            # NOTE(review): '"{prefix}SVTYPE"' is wrapped in single quotes, so
            # it is hashed as a constant string literal rather than the value
            # of the exploded SVTYPE column — verify this is the intended
            # behavior.
            self.conn.execute(
                f"""
                UPDATE {table_variants}
                SET "{variant_id_column}" = hash('{assembly}', "#CHROM", "POS", "REF", "ALT", '"{prefix}SVTYPE"')
                """
            )

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)

        # return variant_id column name
        return variant_id_column
It adds a column to the variants table called variant_id and populates it with a hash of the
#CHROM, POS, REF, and ALT columns
Parameters
- variant_id_column: The name of the column to be created in the variants table, defaults to variant_id
- force: If True, the variant_id column will be created even if it already exists
Returns
The name of the column that contains the variant_id
2778 def get_variant_id_column( 2779 self, variant_id_column: str = "variant_id", force: bool = None 2780 ) -> str: 2781 """ 2782 This function returns the variant_id column name 2783 2784 :param variant_id_column: The name of the column in the dataframe that contains the variant IDs, 2785 defaults to variant_id 2786 :type variant_id_column: str (optional) 2787 :param force: If True, will force the variant_id to be set to the value of variant_id_column. If 2788 False, will only set the variant_id if it is not already set. If None, will set the variant_id 2789 if it is not already set, or if it is set 2790 :type force: bool 2791 :return: The variant_id column name. 2792 """ 2793 2794 return self.set_variant_id(variant_id_column=variant_id_column, force=force)
This function returns the variant_id column name
Parameters
- variant_id_column: The name of the column in the dataframe that contains the variant IDs, defaults to variant_id
- force: If True, will force the variant_id to be set to the value of variant_id_column. If False, will only set the variant_id if it is not already set. If None, will set the variant_id if it is not already set, or if it is set
Returns
The variant_id column name.
2800 def scan_databases( 2801 self, 2802 database_formats: list = ["parquet"], 2803 database_releases: list = ["current"], 2804 ) -> dict: 2805 """ 2806 The function `scan_databases` scans for available databases based on specified formats and 2807 releases. 2808 2809 :param database_formats: The `database_formats` parameter is a list that specifies the formats 2810 of the databases to be scanned. In this case, the accepted format is "parquet" 2811 :type database_formats: list ["parquet"] 2812 :param database_releases: The `database_releases` parameter is a list that specifies the 2813 releases of the databases to be scanned. In the provided function, the default value for 2814 `database_releases` is set to `["current"]`, meaning that by default, the function will scan 2815 databases that are in the "current" 2816 :type database_releases: list 2817 :return: The function `scan_databases` returns a dictionary containing information about 2818 databases that match the specified formats and releases. 2819 """ 2820 2821 # Config 2822 config = self.get_config() 2823 2824 # Param 2825 param = self.get_param() 2826 2827 # Param - Assembly 2828 assembly = param.get("assembly", config.get("assembly", None)) 2829 if not assembly: 2830 assembly = DEFAULT_ASSEMBLY 2831 log.warning(f"Default assembly '{assembly}'") 2832 2833 # Scan for availabled databases 2834 log.info( 2835 f"Annotations - Check annotation parameters - Scan existing databases - Assembly {[assembly]} - Formats {database_formats} - Releases {database_releases}..." 2836 ) 2837 databases_infos_dict = databases_infos( 2838 database_folder_releases=database_releases, 2839 database_formats=database_formats, 2840 assembly=assembly, 2841 config=config, 2842 ) 2843 log.info( 2844 f"Annotations - Check annotation parameters - Scan existing databases - {len(databases_infos_dict)} databases found" 2845 ) 2846 2847 return databases_infos_dict
The function scan_databases scans for available databases based on specified formats and
releases.
Parameters
- database_formats: The
`database_formats` parameter is a list that specifies the formats of the databases to be scanned. In this case, the accepted format is "parquet" - database_releases: The
`database_releases` parameter is a list that specifies the releases of the databases to be scanned. In the provided function, the default value for `database_releases` is set to `["current"]`, meaning that by default, the function will scan databases that are in the "current" release
Returns
The function
`scan_databases` returns a dictionary containing information about databases that match the specified formats and releases.
    def annotation(self) -> None:
        """
        Annotate the variants table using the annotation parameters.

        Steps (as implemented below):

        1. Resolve the assembly from param, then config, else ``DEFAULT_ASSEMBLY``.
        2. Collect annotation database folders from the config
           (``folders.databases.annotations`` / ``parquet`` / ``bcftools``).
        3. Merge the quick ``annotations`` string with the per-tool shortcut
           parameters (``annotation_parquet``, ``annotation_snpsift``,
           ``annotation_snpeff``, ``annotation_bcftools``, ``annotation_annovar``,
           ``annotation_exomiser``, ``annotation_splice``) into a single
           comma-separated ``param["annotations"]`` string.
        4. Expand each entry: ``ALL[:format=...][:release=...]`` is exploded via
           :meth:`scan_databases`; tool-prefixed entries (``snpeff:``,
           ``annovar:``, ``exomiser:``, ``splice:``, ``bcftools:``, ``snpsift:``)
           are routed to their tool section; remaining entries are resolved to a
           database file (directly or by searching the database folders) and
           dispatched to "parquet" or "bcftools" depending on format,
           compression and index availability.
        5. Store the result in ``param["annotation"]`` and run each
           tool-specific ``annotation_*`` method that has parameters.
        6. Optionally explode INFO fields into table columns.
        """

        # Config
        config = self.get_config()

        # Param
        param = self.get_param()

        # Param - Assembly: param first, then config, else fall back to default
        assembly = param.get("assembly", config.get("assembly", None))
        if not assembly:
            assembly = DEFAULT_ASSEMBLY
            log.warning(f"Default assembly '{assembly}'")

        # Annotations databases folders (deduplicated union of configured folders)
        annotations_databases = set(
            config.get("folders", {})
            .get("databases", {})
            .get("annotations", [DEFAULT_ANNOTATIONS_FOLDER])
            + config.get("folders", {})
            .get("databases", {})
            .get("parquet", ["~/howard/databases/parquet/current"])
            + config.get("folders", {})
            .get("databases", {})
            .get("bcftools", ["~/howard/databases/bcftools/current"])
        )

        # Get param annotations (comma-separated string form only)
        if param.get("annotations", None) and isinstance(
            param.get("annotations", None), str
        ):
            log.debug(param.get("annotations", None))
            param_annotation_list = param.get("annotations").split(",")
        else:
            param_annotation_list = []

        # Each tool's shortcut param is folded into the same annotation list.
        # NOTE(review): `!= None` should idiomatically be `is not None` throughout.
        if param.get("annotation_parquet", None) != None:
            log.debug(
                f"""param.get("annotation_parquet", None)={param.get("annotation_parquet", None)}"""
            )
            if isinstance(param.get("annotation_parquet", None), list):
                param_annotation_list.append(",".join(param.get("annotation_parquet")))
            else:
                param_annotation_list.append(param.get("annotation_parquet"))
        if param.get("annotation_snpsift", None) != None:
            # "+" joins multiple snpsift databases in a single "snpsift:" entry
            if isinstance(param.get("annotation_snpsift", None), list):
                param_annotation_list.append(
                    "snpsift:"
                    + "+".join(param.get("annotation_snpsift")).replace(",", "+")
                )
            else:
                param_annotation_list.append(
                    "snpsift:" + param.get("annotation_snpsift").replace(",", "+")
                )
        if param.get("annotation_snpeff", None) != None:
            param_annotation_list.append("snpeff:" + param.get("annotation_snpeff"))
        if param.get("annotation_bcftools", None) != None:
            # Same "+" convention as snpsift for multiple bcftools databases
            if isinstance(param.get("annotation_bcftools", None), list):
                param_annotation_list.append(
                    "bcftools:"
                    + "+".join(param.get("annotation_bcftools")).replace(",", "+")
                )
            else:
                param_annotation_list.append(
                    "bcftools:" + param.get("annotation_bcftools").replace(",", "+")
                )
        if param.get("annotation_annovar", None) != None:
            param_annotation_list.append("annovar:" + param.get("annotation_annovar"))
        if param.get("annotation_exomiser", None) != None:
            param_annotation_list.append("exomiser:" + param.get("annotation_exomiser"))
        if param.get("annotation_splice", None) != None:
            param_annotation_list.append("splice:" + param.get("annotation_splice"))

        # Merge param annotations list back into a single string
        param["annotations"] = ",".join(param_annotation_list)

        # debug
        log.debug(f"param_annotations={param['annotations']}")

        if param.get("annotations"):

            if not "annotation" in param:
                param["annotation"] = {}

            # List of annotations parameters: normalize string form into
            # {annotation_file: {"INFO": None}}; dict form is used as-is
            annotations_list_input = {}
            if isinstance(param.get("annotations", None), str):
                annotation_file_list = [
                    value for value in param.get("annotations", "").split(",")
                ]
                for annotation_file in annotation_file_list:
                    annotations_list_input[annotation_file] = {"INFO": None}
            else:
                annotations_list_input = param.get("annotations", {})

            log.info(f"Quick Annotations:")
            for annotation_key in list(annotations_list_input.keys()):
                log.info(f"   {annotation_key}")

            # List of annotations and associated fields
            annotations_list = {}

            for annotation_file in annotations_list_input:

                # Explode "ALL" entries into every available database
                if (
                    annotation_file.upper() == "ALL"
                    or annotation_file.upper().startswith("ALL:")
                ):

                    # Check ALL options (formats, releases), "+"-separated values
                    annotation_file_split = annotation_file.split(":")
                    database_formats = "parquet"
                    database_releases = "current"
                    for annotation_file_option in annotation_file_split[1:]:
                        database_all_options_split = annotation_file_option.split("=")
                        if database_all_options_split[0] == "format":
                            database_formats = database_all_options_split[1].split("+")
                        if database_all_options_split[0] == "release":
                            database_releases = database_all_options_split[1].split("+")

                    # Scan for available databases
                    databases_infos_dict = self.scan_databases(
                        database_formats=database_formats,
                        database_releases=database_releases,
                    )

                    # Add found databases in annotation parameters
                    for database_infos in databases_infos_dict.keys():
                        annotations_list[database_infos] = {"INFO": None}

                else:
                    annotations_list[annotation_file] = annotations_list_input[
                        annotation_file
                    ]

            # Check each database
            if len(annotations_list):

                log.info(
                    f"Annotations - Check annotation parameters - Check {len(annotations_list)} databases..."
                )

                for annotation_file in annotations_list:

                    # Init
                    annotations = annotations_list.get(annotation_file, None)

                    # Annotation snpEff: everything after "snpeff:" becomes options
                    if annotation_file.startswith("snpeff"):

                        log.debug(f"Quick Annotation snpEff")

                        if "snpeff" not in param["annotation"]:
                            param["annotation"]["snpeff"] = {}

                        if "options" not in param["annotation"]["snpeff"]:
                            param["annotation"]["snpeff"]["options"] = ""

                        # snpEff options in annotations
                        param["annotation"]["snpeff"]["options"] = "".join(
                            annotation_file.split(":")[1:]
                        )

                    # Annotation Annovar: each ":"-separated token is a database key
                    elif annotation_file.startswith("annovar"):

                        log.debug(f"Quick Annotation Annovar")

                        if "annovar" not in param["annotation"]:
                            param["annotation"]["annovar"] = {}

                        if "annotations" not in param["annotation"]["annovar"]:
                            param["annotation"]["annovar"]["annotations"] = {}

                        # Options
                        annotation_file_split = annotation_file.split(":")
                        for annotation_file_annotation in annotation_file_split[1:]:
                            if annotation_file_annotation:
                                param["annotation"]["annovar"]["annotations"][
                                    annotation_file_annotation
                                ] = annotations

                    # Annotation Exomiser: the whole entry is parsed as a params dict
                    elif annotation_file.startswith("exomiser"):

                        log.debug(f"Quick Annotation Exomiser")

                        param["annotation"]["exomiser"] = params_string_to_dict(
                            annotation_file
                        )

                    # Annotation Splice: same params-dict parsing as Exomiser
                    elif annotation_file.startswith("splice"):

                        log.debug(f"Quick Annotation Splice")

                        param["annotation"]["splice"] = params_string_to_dict(
                            annotation_file
                        )

                    # Annotation Parquet or BCFTools or SnpSift (file-based tools)
                    else:

                        # Tool detection from explicit prefix, else auto-detected later
                        if annotation_file.startswith("bcftools:"):
                            annotation_tool_initial = "bcftools"
                            annotation_file = ":".join(annotation_file.split(":")[1:])
                        elif annotation_file.startswith("snpsift:"):
                            annotation_tool_initial = "snpsift"
                            annotation_file = ":".join(annotation_file.split(":")[1:])
                        else:
                            annotation_tool_initial = None

                        # List of files ("+" and ":" both act as separators)
                        annotation_file_list = annotation_file.replace("+", ":").split(
                            ":"
                        )

                        for annotation_file in annotation_file_list:

                            if annotation_file:

                                # Annotation tool initial
                                annotation_tool = annotation_tool_initial

                                # Find file
                                annotation_file_found = None

                                # Expand user
                                annotation_file = full_path(annotation_file)

                                if os.path.exists(annotation_file):
                                    annotation_file_found = annotation_file

                                else:
                                    # Find within assembly folders (first match wins)
                                    for annotations_database in annotations_databases:
                                        found_files = find_all(
                                            annotation_file,
                                            os.path.join(
                                                annotations_database, assembly
                                            ),
                                        )
                                        if len(found_files) > 0:
                                            annotation_file_found = found_files[0]
                                            break
                                    if not annotation_file_found and not assembly:
                                        # Find within folders (no assembly subfolder)
                                        for (
                                            annotations_database
                                        ) in annotations_databases:
                                            found_files = find_all(
                                                annotation_file, annotations_database
                                            )
                                            if len(found_files) > 0:
                                                annotation_file_found = found_files[0]
                                                break
                                    log.debug(
                                        f"for {annotation_file} annotation_file_found={annotation_file_found}"
                                    )

                                # Full path
                                annotation_file_found = full_path(annotation_file_found)

                                if annotation_file_found:

                                    database = Database(database=annotation_file_found)
                                    quick_annotation_format = database.get_format()
                                    quick_annotation_is_compressed = (
                                        database.is_compressed()
                                    )
                                    quick_annotation_is_indexed = os.path.exists(
                                        f"{annotation_file_found}.tbi"
                                    )
                                    # NOTE(review): hard-coded False, so the
                                    # bcftools branch below is currently unreachable
                                    bcftools_preference = False

                                    # Auto-detect annotation tool from the file
                                    if not annotation_tool:
                                        if (
                                            bcftools_preference
                                            and quick_annotation_format
                                            in ["vcf", "bed"]
                                            and quick_annotation_is_compressed
                                            and quick_annotation_is_indexed
                                        ):
                                            annotation_tool = "bcftools"
                                        # NOTE(review): "tsv" is listed twice
                                        elif quick_annotation_format in [
                                            "vcf",
                                            "bed",
                                            "tsv",
                                            "tsv",
                                            "csv",
                                            "json",
                                            "tbl",
                                            "parquet",
                                            "duckdb",
                                        ]:
                                            annotation_tool = "parquet"
                                        else:
                                            log.error(
                                                f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet"
                                            )
                                            raise ValueError(
                                                f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet"
                                            )

                                    log.debug(
                                        f"Quick Annotation File {annotation_file} - Annotation tool: {annotation_tool}"
                                    )

                                    # Annotation tool dispatch: register the
                                    # resolved file under its tool section
                                    if annotation_tool:
                                        if annotation_tool not in param["annotation"]:
                                            param["annotation"][annotation_tool] = {}
                                        if (
                                            "annotations"
                                            not in param["annotation"][annotation_tool]
                                        ):
                                            param["annotation"][annotation_tool][
                                                "annotations"
                                            ] = {}
                                        param["annotation"][annotation_tool][
                                            "annotations"
                                        ][annotation_file_found] = annotations

                                else:
                                    log.error(
                                        f"Quick Annotation File {annotation_file} does NOT exist"
                                    )

        self.set_param(param)

        # Run each tool-specific annotation step that has parameters
        if param.get("annotation", None):
            log.info("Annotations")
            if param.get("annotation", {}).get("parquet", None):
                log.info("Annotations 'parquet'...")
                self.annotation_parquet()
            if param.get("annotation", {}).get("bcftools", None):
                log.info("Annotations 'bcftools'...")
                self.annotation_bcftools()
            if param.get("annotation", {}).get("snpsift", None):
                log.info("Annotations 'snpsift'...")
                self.annotation_snpsift()
            if param.get("annotation", {}).get("annovar", None):
                log.info("Annotations 'annovar'...")
                self.annotation_annovar()
            if param.get("annotation", {}).get("snpeff", None):
                log.info("Annotations 'snpeff'...")
                self.annotation_snpeff()
            if param.get("annotation", {}).get("exomiser", None) is not None:
                log.info("Annotations 'exomiser'...")
                self.annotation_exomiser()
            if param.get("annotation", {}).get("splice", None) is not None:
                log.info("Annotations 'splice' ...")
                self.annotation_splice()

        # Explode INFO fields into table fields
        if self.get_explode_infos():
            self.explode_infos(
                prefix=self.get_explode_infos_prefix(),
                fields=self.get_explode_infos_fields(),
                force=True,
            )
It annotates the VCF file with the annotations specified in the config file.
3221 def annotation_snpsift(self, threads: int = None) -> None: 3222 """ 3223 This function annotate with bcftools 3224 3225 :param threads: Number of threads to use 3226 :return: the value of the variable "return_value". 3227 """ 3228 3229 # DEBUG 3230 log.debug("Start annotation with bcftools databases") 3231 3232 # Threads 3233 if not threads: 3234 threads = self.get_threads() 3235 log.debug("Threads: " + str(threads)) 3236 3237 # Config 3238 config = self.get_config() 3239 log.debug("Config: " + str(config)) 3240 3241 # Config - snpSift 3242 snpsift_bin_command = get_bin_command( 3243 bin="SnpSift.jar", 3244 tool="snpsift", 3245 bin_type="jar", 3246 config=config, 3247 default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff", 3248 ) 3249 if not snpsift_bin_command: 3250 msg_err = f"Annotation failed: no snpsift bin '{snpsift_bin_command}'" 3251 log.error(msg_err) 3252 raise ValueError(msg_err) 3253 3254 # Config - bcftools 3255 bcftools_bin_command = get_bin_command( 3256 bin="bcftools", 3257 tool="bcftools", 3258 bin_type="bin", 3259 config=config, 3260 default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools", 3261 ) 3262 if not bcftools_bin_command: 3263 msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'" 3264 log.error(msg_err) 3265 raise ValueError(msg_err) 3266 3267 # Config - BCFTools databases folders 3268 databases_folders = set( 3269 self.get_config() 3270 .get("folders", {}) 3271 .get("databases", {}) 3272 .get("annotations", ["."]) 3273 + self.get_config() 3274 .get("folders", {}) 3275 .get("databases", {}) 3276 .get("bcftools", ["."]) 3277 ) 3278 log.debug("Databases annotations: " + str(databases_folders)) 3279 3280 # Param 3281 annotations = ( 3282 self.get_param() 3283 .get("annotation", {}) 3284 .get("snpsift", {}) 3285 .get("annotations", None) 3286 ) 3287 log.debug("Annotations: " + str(annotations)) 3288 3289 # Assembly 3290 assembly = self.get_param().get( 3291 "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY) 3292 ) 3293 
3294 # Data 3295 table_variants = self.get_table_variants() 3296 3297 # Check if not empty 3298 log.debug("Check if not empty") 3299 sql_query_chromosomes = ( 3300 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 3301 ) 3302 sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes) 3303 if not sql_query_chromosomes_df["count"][0]: 3304 log.info(f"VCF empty") 3305 return 3306 3307 # VCF header 3308 vcf_reader = self.get_header() 3309 log.debug("Initial header: " + str(vcf_reader.infos)) 3310 3311 # Existing annotations 3312 for vcf_annotation in self.get_header().infos: 3313 3314 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 3315 log.debug( 3316 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 3317 ) 3318 3319 if annotations: 3320 3321 with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir: 3322 3323 # Export VCF file 3324 tmp_vcf_name = os.path.join(tmp_dir, "input.vcf.gz") 3325 3326 # Init 3327 commands = {} 3328 3329 for annotation in annotations: 3330 annotation_fields = annotations[annotation] 3331 3332 # Annotation Name 3333 annotation_name = os.path.basename(annotation) 3334 3335 if not annotation_fields: 3336 annotation_fields = {"INFO": None} 3337 3338 log.debug(f"Annotation '{annotation_name}'") 3339 log.debug( 3340 f"Annotation '{annotation_name}' - fields: {annotation_fields}" 3341 ) 3342 3343 # Create Database 3344 database = Database( 3345 database=annotation, 3346 databases_folders=databases_folders, 3347 assembly=assembly, 3348 ) 3349 3350 # Find files 3351 db_file = database.get_database() 3352 db_file = full_path(db_file) 3353 db_hdr_file = database.get_header_file() 3354 db_hdr_file = full_path(db_hdr_file) 3355 db_file_type = database.get_format() 3356 db_tbi_file = f"{db_file}.tbi" 3357 db_file_compressed = database.is_compressed() 3358 3359 # Check if compressed 3360 if not db_file_compressed: 3361 log.error( 3362 f"Annotation '{annotation}' - {db_file} NOT 
compressed file" 3363 ) 3364 raise ValueError( 3365 f"Annotation '{annotation}' - {db_file} NOT compressed file" 3366 ) 3367 3368 # Check if indexed 3369 if not os.path.exists(db_tbi_file): 3370 log.error( 3371 f"Annotation '{annotation}' - {db_file} NOT indexed file" 3372 ) 3373 raise ValueError( 3374 f"Annotation '{annotation}' - {db_file} NOT indexed file" 3375 ) 3376 3377 # Check index - try to create if not exists 3378 if not os.path.exists(db_file) or not os.path.exists(db_hdr_file): 3379 log.error("Annotation failed: database not valid") 3380 log.error(f"Annotation annotation file: {db_file}") 3381 log.error(f"Annotation annotation header: {db_hdr_file}") 3382 log.error(f"Annotation annotation index: {db_tbi_file}") 3383 raise ValueError( 3384 f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}" 3385 ) 3386 else: 3387 3388 log.debug( 3389 f"Annotation '{annotation}' - file: " 3390 + str(db_file) 3391 + " and " 3392 + str(db_hdr_file) 3393 ) 3394 3395 # Load header as VCF object 3396 db_hdr_vcf = Variants(input=db_hdr_file) 3397 db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos 3398 log.debug( 3399 "Annotation database header: " 3400 + str(db_hdr_vcf_header_infos) 3401 ) 3402 3403 # For all fields in database 3404 annotation_fields_full = False 3405 if "ALL" in annotation_fields or "INFO" in annotation_fields: 3406 annotation_fields = { 3407 key: key for key in db_hdr_vcf_header_infos 3408 } 3409 log.debug( 3410 "Annotation database header - All annotations added: " 3411 + str(annotation_fields) 3412 ) 3413 annotation_fields_full = True 3414 3415 # # Create file for field rename 3416 # log.debug("Create file for field rename") 3417 # tmp_rename = NamedTemporaryFile( 3418 # prefix=self.get_prefix(), 3419 # dir=self.get_tmp_dir(), 3420 # suffix=".rename", 3421 # delete=False, 3422 # ) 3423 # tmp_rename_name = tmp_rename.name 
3424 # tmp_files.append(tmp_rename_name) 3425 3426 # Number of fields 3427 nb_annotation_field = 0 3428 annotation_list = [] 3429 annotation_infos_rename_list = [] 3430 3431 for annotation_field in annotation_fields: 3432 3433 # field new name, if parametered SKIPPED !!!!!! not managed actually TODO 3434 annotation_fields_new_name = annotation_fields.get( 3435 annotation_field, annotation_field 3436 ) 3437 if not annotation_fields_new_name: 3438 annotation_fields_new_name = annotation_field 3439 3440 # Check if field is in DB and if field is not elready in input data 3441 if ( 3442 annotation_field in db_hdr_vcf.get_header().infos 3443 and annotation_fields_new_name 3444 not in self.get_header().infos 3445 ): 3446 3447 log.info( 3448 f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'" 3449 ) 3450 3451 # BCFTools annotate param to rename fields 3452 if annotation_field != annotation_fields_new_name: 3453 annotation_infos_rename_list.append( 3454 f"{annotation_fields_new_name}:=INFO/{annotation_field}" 3455 ) 3456 3457 # Add INFO field to header 3458 db_hdr_vcf_header_infos_number = ( 3459 db_hdr_vcf_header_infos[annotation_field].num or "." 
3460 ) 3461 db_hdr_vcf_header_infos_type = ( 3462 db_hdr_vcf_header_infos[annotation_field].type 3463 or "String" 3464 ) 3465 db_hdr_vcf_header_infos_description = ( 3466 db_hdr_vcf_header_infos[annotation_field].desc 3467 or f"{annotation_field} description" 3468 ) 3469 db_hdr_vcf_header_infos_source = ( 3470 db_hdr_vcf_header_infos[annotation_field].source 3471 or "unknown" 3472 ) 3473 db_hdr_vcf_header_infos_version = ( 3474 db_hdr_vcf_header_infos[annotation_field].version 3475 or "unknown" 3476 ) 3477 3478 vcf_reader.infos[annotation_fields_new_name] = ( 3479 vcf.parser._Info( 3480 annotation_fields_new_name, 3481 db_hdr_vcf_header_infos_number, 3482 db_hdr_vcf_header_infos_type, 3483 db_hdr_vcf_header_infos_description, 3484 db_hdr_vcf_header_infos_source, 3485 db_hdr_vcf_header_infos_version, 3486 self.code_type_map[ 3487 db_hdr_vcf_header_infos_type 3488 ], 3489 ) 3490 ) 3491 3492 annotation_list.append(annotation_field) 3493 3494 nb_annotation_field += 1 3495 3496 else: 3497 3498 if ( 3499 annotation_field 3500 not in db_hdr_vcf.get_header().infos 3501 ): 3502 log.warning( 3503 f"Annotation '{annotation_name}' - '{annotation_field}' - not available in vcf/bed file" 3504 ) 3505 if ( 3506 annotation_fields_new_name 3507 in self.get_header().infos 3508 ): 3509 log.warning( 3510 f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' - already exists (skipped)" 3511 ) 3512 3513 log.info( 3514 f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file" 3515 ) 3516 3517 annotation_infos = ",".join(annotation_list) 3518 3519 if annotation_infos != "": 3520 3521 # Annotated VCF (and error file) 3522 tmp_annotation_vcf_name = os.path.join( 3523 tmp_dir, os.path.basename(annotation) + ".vcf.gz" 3524 ) 3525 tmp_annotation_vcf_name_err = ( 3526 tmp_annotation_vcf_name + ".err" 3527 ) 3528 3529 # Add fields to annotate 3530 if not annotation_fields_full: 3531 annotation_infos_option = f"-info {annotation_infos}" 3532 else: 
3533 annotation_infos_option = "" 3534 3535 # Info fields rename 3536 if annotation_infos_rename_list: 3537 annotation_infos_rename = " -c " + ",".join( 3538 annotation_infos_rename_list 3539 ) 3540 else: 3541 annotation_infos_rename = "" 3542 3543 # Annotate command 3544 command_annotate = f"{snpsift_bin_command} annotate {annotation_infos_option} {db_file} {tmp_vcf_name} | {bcftools_bin_command} annotate --threads={threads} {annotation_infos_rename} -Oz1 -o {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} " 3545 3546 # Add command 3547 commands[command_annotate] = tmp_annotation_vcf_name 3548 3549 if commands: 3550 3551 # Export VCF file 3552 self.export_variant_vcf( 3553 vcf_file=tmp_vcf_name, 3554 remove_info=True, 3555 add_samples=False, 3556 index=True, 3557 ) 3558 shutil.copyfile(tmp_vcf_name, "/tmp/input.vcf") 3559 3560 # Num command 3561 nb_command = 0 3562 3563 # Annotate 3564 for command_annotate in commands: 3565 nb_command += 1 3566 log.info( 3567 f"Annotation - Annotate [{nb_command}/{len(commands)}]..." 3568 ) 3569 log.debug(f"command_annotate={command_annotate}") 3570 run_parallel_commands([command_annotate], threads) 3571 3572 # Debug 3573 shutil.copyfile(commands[command_annotate], "/tmp/snpsift.vcf") 3574 3575 # Update variants 3576 log.info( 3577 f"Annotation - Updating [{nb_command}/{len(commands)}]..." 3578 ) 3579 self.update_from_vcf(commands[command_annotate])
This function annotate with bcftools
Parameters
- threads: Number of threads to use
Returns
the value of the variable "return_value".
    def annotation_bcftools(self, threads: int = None) -> None:
        """
        Annotate the variants with ``bcftools annotate``, using VCF/BED databases.

        For each configured database (param ``annotation.bcftools.annotations``),
        checks that the file is compressed and tabix-indexed, adds the selected
        INFO fields to this object's VCF header, then - per chromosome - builds
        a merged-regions BED around the variant positions and runs
        ``bcftools annotate`` restricted to those regions. All per-chromosome
        outputs are merged with ``bcftools merge`` and folded back into the
        variants table with :meth:`update_from_vcf`.

        :param threads: Number of threads to use; defaults to the object's
            configured thread count.
        :raises ValueError: if the bcftools binary cannot be found, if a
            database file is missing / not compressed / not indexed, or if any
            annotate/merge command reports an ``[E::...]`` error.
        :return: None
        """

        # DEBUG
        log.debug("Start annotation with bcftools databases")

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Config
        config = self.get_config()
        log.debug("Config: " + str(config))

        # Keep tmp files when verbosity is "debug"
        # NOTE(review): delete_tmp is computed but never read below - tmp files
        # are actually removed by the "rm -f" appended to the merge command.
        delete_tmp = True
        if self.get_config().get("verbosity", "warning") in ["debug"]:
            delete_tmp = False
        log.debug("Delete tmp files/folders: " + str(delete_tmp))

        # Config - BCFTools bin command
        bcftools_bin_command = get_bin_command(
            bin="bcftools",
            tool="bcftools",
            bin_type="bin",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
        )
        if not bcftools_bin_command:
            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - BCFTools databases folders (annotations + bcftools folders)
        databases_folders = set(
            self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("annotations", ["."])
            + self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("bcftools", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # Param
        annotations = (
            self.get_param()
            .get("annotation", {})
            .get("bcftools", {})
            .get("annotations", None)
        )
        log.debug("Annotations: " + str(annotations))

        # Assembly
        assembly = self.get_param().get(
            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
        )

        # Data
        table_variants = self.get_table_variants()

        # Check if not empty - nothing to annotate on an empty variants table
        log.debug("Check if not empty")
        sql_query_chromosomes = (
            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
        )
        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
        if not sql_query_chromosomes_df["count"][0]:
            log.info(f"VCF empty")
            return

        # Export in VCF: tmp file that will hold the exported variants
        log.debug("Create initial file to annotate")
        tmp_vcf = NamedTemporaryFile(
            prefix=self.get_prefix(),
            dir=self.get_tmp_dir(),
            suffix=".vcf.gz",
            delete=False,
        )
        tmp_vcf_name = tmp_vcf.name

        # VCF header
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Existing annotations
        for vcf_annotation in self.get_header().infos:

            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
            log.debug(
                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
            )

        if annotations:

            # Per-database/per-chromosome outputs, commands and tmp bookkeeping
            tmp_ann_vcf_list = []
            commands = []
            tmp_files = []
            err_files = []

            for annotation in annotations:
                annotation_fields = annotations[annotation]

                # Annotation Name
                annotation_name = os.path.basename(annotation)

                # No explicit fields means "all INFO fields"
                if not annotation_fields:
                    annotation_fields = {"INFO": None}

                log.debug(f"Annotation '{annotation_name}'")
                log.debug(
                    f"Annotation '{annotation_name}' - fields: {annotation_fields}"
                )

                # Create Database
                database = Database(
                    database=annotation,
                    databases_folders=databases_folders,
                    assembly=assembly,
                )

                # Find files
                db_file = database.get_database()
                db_file = full_path(db_file)
                db_hdr_file = database.get_header_file()
                db_hdr_file = full_path(db_hdr_file)
                db_file_type = database.get_format()
                db_tbi_file = f"{db_file}.tbi"
                db_file_compressed = database.is_compressed()

                # Check if compressed (bcftools requires bgzip)
                if not db_file_compressed:
                    log.error(
                        f"Annotation '{annotation}' - {db_file} NOT compressed file"
                    )
                    raise ValueError(
                        f"Annotation '{annotation}' - {db_file} NOT compressed file"
                    )

                # Check if tabix-indexed
                if not os.path.exists(db_tbi_file):
                    log.error(f"Annotation '{annotation}' - {db_file} NOT indexed file")
                    raise ValueError(
                        f"Annotation '{annotation}' - {db_file} NOT indexed file"
                    )

                # Check database and header files exist
                if not os.path.exists(db_file) or not os.path.exists(db_hdr_file):
                    log.error("Annotation failed: database not valid")
                    log.error(f"Annotation annotation file: {db_file}")
                    log.error(f"Annotation annotation header: {db_hdr_file}")
                    log.error(f"Annotation annotation index: {db_tbi_file}")
                    raise ValueError(
                        f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}"
                    )
                else:

                    log.debug(
                        f"Annotation '{annotation}' - file: "
                        + str(db_file)
                        + " and "
                        + str(db_hdr_file)
                    )

                    # Load header as VCF object
                    db_hdr_vcf = Variants(input=db_hdr_file)
                    db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos
                    log.debug(
                        "Annotation database header: " + str(db_hdr_vcf_header_infos)
                    )

                    # "ALL"/"INFO" expands to every field in the database header
                    if "ALL" in annotation_fields or "INFO" in annotation_fields:
                        annotation_fields = {
                            key: key for key in db_hdr_vcf_header_infos
                        }
                        log.debug(
                            "Annotation database header - All annotations added: "
                            + str(annotation_fields)
                        )

                    # Number of fields
                    nb_annotation_field = 0
                    annotation_list = []

                    for annotation_field in annotation_fields:

                        # Field new name (falls back to the original name)
                        annotation_fields_new_name = annotation_fields.get(
                            annotation_field, annotation_field
                        )
                        if not annotation_fields_new_name:
                            annotation_fields_new_name = annotation_field

                        # Keep field only if present in DB and not already in input
                        if (
                            annotation_field in db_hdr_vcf.get_header().infos
                            and annotation_fields_new_name
                            not in self.get_header().infos
                        ):

                            log.info(
                                f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'"
                            )

                            # Add INFO field to header, with sane fallbacks
                            db_hdr_vcf_header_infos_number = (
                                db_hdr_vcf_header_infos[annotation_field].num or "."
                            )
                            db_hdr_vcf_header_infos_type = (
                                db_hdr_vcf_header_infos[annotation_field].type
                                or "String"
                            )
                            db_hdr_vcf_header_infos_description = (
                                db_hdr_vcf_header_infos[annotation_field].desc
                                or f"{annotation_field} description"
                            )
                            db_hdr_vcf_header_infos_source = (
                                db_hdr_vcf_header_infos[annotation_field].source
                                or "unknown"
                            )
                            db_hdr_vcf_header_infos_version = (
                                db_hdr_vcf_header_infos[annotation_field].version
                                or "unknown"
                            )

                            vcf_reader.infos[annotation_fields_new_name] = (
                                vcf.parser._Info(
                                    annotation_fields_new_name,
                                    db_hdr_vcf_header_infos_number,
                                    db_hdr_vcf_header_infos_type,
                                    db_hdr_vcf_header_infos_description,
                                    db_hdr_vcf_header_infos_source,
                                    db_hdr_vcf_header_infos_version,
                                    self.code_type_map[db_hdr_vcf_header_infos_type],
                                )
                            )

                            # bcftools -c expression (rename if new name differs)
                            if annotation_field != annotation_fields_new_name:
                                annotation_list.append(
                                    f"{annotation_fields_new_name}:=INFO/{annotation_field}"
                                )
                            else:
                                annotation_list.append(annotation_field)

                            nb_annotation_field += 1

                        else:

                            if annotation_field not in db_hdr_vcf.get_header().infos:
                                log.warning(
                                    f"Annotation '{annotation}' - '{annotation_field}' - not available in vcf/bed file"
                                )
                            if annotation_fields_new_name in self.get_header().infos:
                                log.warning(
                                    f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)"
                                )

                    log.info(
                        f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file"
                    )

                    annotation_infos = ",".join(annotation_list)

                    if annotation_infos != "":

                        # Protect header for bcftools (keep only "##" meta lines)
                        log.debug("Protect Header file - remove #CHROM line if exists")
                        tmp_header_vcf = NamedTemporaryFile(
                            prefix=self.get_prefix(),
                            dir=self.get_tmp_dir(),
                            suffix=".hdr",
                            delete=False,
                        )
                        tmp_header_vcf_name = tmp_header_vcf.name
                        tmp_files.append(tmp_header_vcf_name)
                        # Command
                        if db_hdr_file.endswith(".gz"):
                            command_extract_header = f"zcat {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}"
                        else:
                            command_extract_header = f"cat {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}"
                        # Run
                        run_parallel_commands([command_extract_header], 1)

                        # Find chromosomes present in the variants table
                        log.debug("Find chromosomes ")
                        sql_query_chromosomes = f"""SELECT table_variants.\"#CHROM\" as CHROM FROM {table_variants} as table_variants GROUP BY table_variants.\"#CHROM\""""
                        sql_query_chromosomes_df = self.get_query_to_df(
                            sql_query_chromosomes
                        )
                        chomosomes_list = list(sql_query_chromosomes_df["CHROM"])

                        log.debug("Chromosomes found: " + str(list(chomosomes_list)))

                        # BED columns prepended for BED-format annotation files
                        if db_file_type in ["bed"]:
                            annotation_infos = "CHROM,POS,POS," + annotation_infos

                        for chrom in chomosomes_list:

                            # Create BED on initial VCF
                            log.debug("Create BED on initial VCF: " + str(tmp_vcf_name))
                            tmp_bed = NamedTemporaryFile(
                                prefix=self.get_prefix(),
                                dir=self.get_tmp_dir(),
                                suffix=".bed",
                                delete=False,
                            )
                            tmp_bed_name = tmp_bed.name
                            tmp_files.append(tmp_bed_name)

                            # Detect regions: +/- 1Mb window around each
                            # position, clamped at 0, then merged
                            log.debug(
                                f"Annotation '{annotation}' - Chromosome '{chrom}' - Start detecting regions..."
                            )
                            window = 1000000
                            sql_query_intervals_for_bed = f"""
                                SELECT \"#CHROM\",
                                CASE WHEN \"POS\"-{window}-1 < 0 THEN 0 ELSE \"POS\"-{window}-1 END,
                                \"POS\"+{window}
                                FROM {table_variants} as table_variants
                                WHERE table_variants.\"#CHROM\" = '{chrom}'
                            """
                            regions = self.conn.execute(
                                sql_query_intervals_for_bed
                            ).fetchall()
                            merged_regions = merge_regions(regions)
                            log.debug(
                                f"Annotation '{annotation}' - Chromosome '{chrom}' - Stop detecting regions..."
                            )

                            header = ["#CHROM", "START", "END"]
                            with open(tmp_bed_name, "w") as f:
                                # Write the header with tab delimiter
                                f.write("\t".join(header) + "\n")
                                for d in merged_regions:
                                    # Write each data row with tab delimiter
                                    f.write("\t".join(map(str, d)) + "\n")

                            # Tmp files for this chromosome's annotated output
                            tmp_annotation_vcf = NamedTemporaryFile(
                                prefix=self.get_prefix(),
                                dir=self.get_tmp_dir(),
                                suffix=".vcf.gz",
                                delete=False,
                            )
                            tmp_annotation_vcf_name = tmp_annotation_vcf.name
                            tmp_files.append(tmp_annotation_vcf_name)
                            tmp_ann_vcf_list.append(f"{tmp_annotation_vcf_name}")
                            tmp_annotation_vcf_name_err = (
                                tmp_annotation_vcf_name + ".err"
                            )
                            err_files.append(tmp_annotation_vcf_name_err)

                            # Annotate Command
                            log.debug(
                                f"Annotation '{annotation}' - add bcftools command"
                            )

                            # Command: annotate restricted to the BED regions,
                            # then tabix the output; stderr goes to the .err file
                            command_annotate = f"{bcftools_bin_command} annotate --pair-logic exact --regions-file={tmp_bed_name} -a {db_file} -h {tmp_header_vcf_name} -c {annotation_infos} {tmp_vcf_name} -o {tmp_annotation_vcf_name} -Oz1 2>>{tmp_annotation_vcf_name_err} && tabix {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} "

                            # Add command
                            commands.append(command_annotate)

            # if some commands
            if commands:

                # Export the current variants as the VCF input of the commands
                self.export_variant_vcf(
                    vcf_file=tmp_vcf_name,
                    remove_info=True,
                    add_samples=False,
                    index=True,
                )

                # Threads
                # Split the thread budget across the annotate commands
                if commands:
                    threads_bcftools_annotate = round(threads / len(commands))
                else:
                    threads_bcftools_annotate = 1

                # round() may yield 0 when commands outnumber threads
                if not threads_bcftools_annotate:
                    threads_bcftools_annotate = 1

                # Add threads option to bcftools commands
                if threads_bcftools_annotate > 1:
                    commands_threaded = []
                    for command in commands:
                        commands_threaded.append(
                            command.replace(
                                f"{bcftools_bin_command} annotate ",
                                f"{bcftools_bin_command} annotate --threads={threads_bcftools_annotate} ",
                            )
                        )
                    commands = commands_threaded

                # Command annotation multithreading
                log.debug(f"Annotation - Annotation commands: " + str(commands))
                log.info(
                    f"Annotation - Annotation multithreaded in "
                    + str(len(commands))
                    + " commands"
                )

                run_parallel_commands(commands, threads)

                # Merge all per-chromosome annotated VCFs
                tmp_ann_vcf_list_cmd = " ".join(tmp_ann_vcf_list)

                if tmp_ann_vcf_list_cmd:

                    # Tmp file
                    tmp_annotate_vcf = NamedTemporaryFile(
                        prefix=self.get_prefix(),
                        dir=self.get_tmp_dir(),
                        suffix=".vcf.gz",
                        delete=True,
                    )
                    tmp_annotate_vcf_name = tmp_annotate_vcf.name
                    tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
                    err_files.append(tmp_annotate_vcf_name_err)

                    # Tmp file remove command (appended after a successful merge)
                    tmp_files_remove_command = ""
                    if tmp_files:
                        tmp_files_remove_command = " && rm -f " + " ".join(tmp_files)

                    # Command merge
                    merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_ann_vcf_list_cmd} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} {tmp_files_remove_command}"
                    log.info(
                        f"Annotation - Annotation merging "
                        + str(len(commands))
                        + " annotated files"
                    )
                    log.debug(f"Annotation - merge command: {merge_command}")
                    run_parallel_commands([merge_command], 1)

                    # Collect error/warning messages from all .err files
                    log.info(f"Error/Warning messages:")
                    error_message_command_all = []
                    error_message_command_warning = []
                    error_message_command_err = []
                    for err_file in err_files:
                        with open(err_file, "r") as f:
                            for line in f:
                                message = line.strip()
                                error_message_command_all.append(message)
                                # htslib convention: [W::...] warning, [E::...] error
                                if line.startswith("[W::"):
                                    error_message_command_warning.append(message)
                                if line.startswith("[E::"):
                                    error_message_command_err.append(
                                        f"{err_file}: " + message
                                    )
                    # log info
                    for message in list(
                        set(error_message_command_err + error_message_command_warning)
                    ):
                        log.info(f"   {message}")
                    # debug info
                    for message in list(set(error_message_command_all)):
                        log.debug(f"   {message}")
                    # Fail hard if any command reported an error
                    if len(error_message_command_err):
                        log.error("Annotation failed: Error in commands")
                        raise ValueError("Annotation failed: Error in commands")

                    # Update variants
                    log.info(f"Annotation - Updating...")
                    self.update_from_vcf(tmp_annotate_vcf_name)
This function annotates variants with bcftools.
Parameters:
- threads: number of threads to use.
Returns: None (variants are updated in place in the database).
    def annotation_exomiser(self, threads: int = None) -> bool:
        """
        This function annotates variants with Exomiser.

        This function uses args as parameters, in section "annotation" -> "exomiser", with sections:
        - "analysis" (dict/file):
            Full analysis dictionary parameters (see Exomiser docs).
            Either a dict, or a file in JSON or YAML format.
            These parameters may change depending on other parameters (e.g. phenotypicFeatures/HPO)
            Default: None
        - "preset" (string):
            Analysis preset (available in config folder).
            Used if no full "analysis" is provided.
            Default: "exome"
        - "phenopacket" (dict/file):
            Samples and phenotypic features parameters (see Exomiser docs).
            Either a dict, or a file in JSON or YAML format.
            Default: None
        - "subject" (dict):
            Sample parameters (see Exomiser docs).
            Example:
                "subject":
                    {
                        "id": "ISDBM322017",
                        "sex": "FEMALE"
                    }
            Default: None
        - "sample" (string):
            Sample name to construct "subject" section:
                "subject":
                    {
                        "id": "<sample>",
                        "sex": "UNKNOWN_SEX"
                    }
            Default: None
        - "phenotypicFeatures" (dict)
            Phenotypic features to construct "subject" section.
            Example:
                "phenotypicFeatures":
                    [
                        { "type": { "id": "HP:0001159", "label": "Syndactyly" } },
                        { "type": { "id": "HP:0000486", "label": "Strabismus" } }
                    ]
        - "hpo" (list)
            List of HPO ids as phenotypic features.
            Example:
                "hpo": ['0001156', '0001363', '0011304', '0010055']
            Default: []
        - "outputOptions" (dict):
            Output options (see Exomiser docs).
            Default:
                "output_options" =
                    {
                        "outputContributingVariantsOnly": False,
                        "numGenes": 0,
                        "outputFormats": ["TSV_VARIANT", "VCF"]
                    }
        - "transcript_source" (string):
            Transcript source (either "refseq", "ucsc", "ensembl")
            Default: "refseq"
        - "exomiser_to_info" (boolean):
            Add exomiser TSV file columns as INFO fields in VCF.
            Default: False
        - "release" (string):
            Exomiser database release.
            If it does not exist, the database release will be downloaded (takes a while).
            Default: None (provided by application.properties configuration file)
        - "exomiser_application_properties" (file):
            Exomiser configuration file (see Exomiser docs).
            Useful to automatically download databases (especially for specific genome databases).

        Notes:
        - If no sample in parameters, the first sample in the VCF will be chosen
        - If no HPO found, the "hiPhivePrioritiser" analysis step will be switched off

        :param threads: The number of threads to use
        :return: True when annotation completed; False when the VCF is empty or has no samples.
        """

        # DEBUG
        log.debug("Start annotation with Exomiser databases")

        # Threads: fall back to the instance-level thread count
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Config
        config = self.get_config()
        log.debug("Config: " + str(config))

        # Config - Folders - Databases
        # Exomiser database folder; a missing folder is logged as an error but
        # execution continues (databases_download_exomiser below may create it)
        databases_folders = (
            config.get("folders", {})
            .get("databases", {})
            .get("exomiser", f"{DEFAULT_DATABASE_FOLDER}/exomiser/current")
        )
        databases_folders = full_path(databases_folders)
        if not os.path.exists(databases_folders):
            log.error(f"Databases annotations: {databases_folders} NOT found")
        log.debug("Databases annotations: " + str(databases_folders))

        # Config - Exomiser: resolve the java -jar command for exomiser-cli
        exomiser_bin_command = get_bin_command(
            bin="exomiser-cli*.jar",
            tool="exomiser",
            bin_type="jar",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/exomiser",
        )
        log.debug("Exomiser bin command: " + str(exomiser_bin_command))
        if not exomiser_bin_command:
            msg_err = f"Annotation failed: no exomiser bin '{exomiser_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Param
        param = self.get_param()
        log.debug("Param: " + str(param))

        # Param - Exomiser section
        param_exomiser = param.get("annotation", {}).get("exomiser", {})
        log.debug(f"Param Exomiser: {param_exomiser}")

        # Param - Assembly (param overrides config, config overrides default)
        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
        log.debug("Assembly: " + str(assembly))

        # Data
        table_variants = self.get_table_variants()

        # Check if not empty: nothing to annotate on an empty variants table
        log.debug("Check if not empty")
        sql_query_chromosomes = (
            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
        )
        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
            log.info(f"VCF empty")
            return False

        # VCF header
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Samples: Exomiser requires at least one sample
        samples = self.get_header_sample_list()
        if not samples:
            log.error("No Samples in VCF")
            return False
        log.debug(f"Samples: {samples}")

        # Memory limit
        memory_limit = self.get_memory("8G")
        log.debug(f"memory_limit: {memory_limit}")

        # Exomiser java options
        # NOTE(review): built but not visibly used below — presumably consumed
        # by get_bin_command/environment; confirm
        exomiser_java_options = (
            f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} "
        )
        log.debug(f"Exomiser java options: {exomiser_java_options}")

        # Download Exomiser databases (if not already present)
        exomiser_release = param_exomiser.get("release", None)
        exomiser_application_properties = param_exomiser.get(
            "exomiser_application_properties", None
        )
        databases_download_exomiser(
            assemblies=[assembly],
            exomiser_folder=databases_folders,
            exomiser_release=exomiser_release,
            exomiser_phenotype_release=exomiser_release,
            exomiser_application_properties=exomiser_application_properties,
        )

        # Force annotation (hard-coded: always re-annotate even if "Exomiser"
        # INFO field already exists in the header)
        force_update_annotation = True

        if "Exomiser" not in self.get_header().infos or force_update_annotation:
            log.debug("Start annotation Exomiser")

            with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir:

                # tmp_dir = "/tmp/exomiser"

                ### ANALYSIS ###
                ################

                # Create analysis.json through analysis dict
                # either analysis in param or by default
                # (depending on preset exome/genome)

                # Init analysis dict
                param_exomiser_analysis_dict = {}

                # analysis from param
                param_exomiser_analysis = param_exomiser.get("analysis", {})
                param_exomiser_analysis = full_path(param_exomiser_analysis)

                # If analysis in param -> load analysis json
                if param_exomiser_analysis:

                    # If param analysis is a file and exists
                    if isinstance(param_exomiser_analysis, str) and os.path.exists(
                        param_exomiser_analysis
                    ):
                        # Load analysis file into analysis dict (either yaml or json;
                        # yaml.safe_load parses both since JSON is a YAML subset)
                        with open(param_exomiser_analysis) as json_file:
                            param_exomiser_analysis_dict = yaml.safe_load(json_file)

                    # If param analysis is a dict
                    elif isinstance(param_exomiser_analysis, dict):
                        # Use analysis dict as-is
                        param_exomiser_analysis_dict = param_exomiser_analysis

                    # Error analysis type
                    else:
                        log.error(f"Analysis type unknown. Check param file.")
                        raise ValueError(f"Analysis type unknown. Check param file.")

                # Case no input analysis config file/dict
                # Use preset (exome/genome) to open default config file
                if not param_exomiser_analysis_dict:

                    # default preset
                    default_preset = "exome"

                    # Get param preset or default preset
                    param_exomiser_preset = param_exomiser.get("preset", default_preset)

                    # Try to find if preset is a file
                    if os.path.exists(param_exomiser_preset):
                        # Preset file is provided in full path
                        param_exomiser_analysis_default_config_file = (
                            param_exomiser_preset
                        )
                    # elif os.path.exists(full_path(param_exomiser_preset)):
                    #     # Preset file is provided in full path
                    #     param_exomiser_analysis_default_config_file = full_path(param_exomiser_preset)
                    elif os.path.exists(
                        os.path.join(folder_config, param_exomiser_preset)
                    ):
                        # Preset file is provided as a basename in config folder (can be a path with subfolders)
                        param_exomiser_analysis_default_config_file = os.path.join(
                            folder_config, param_exomiser_preset
                        )
                    else:
                        # Construct preset file name "preset-<preset>-analysis.json"
                        param_exomiser_analysis_default_config_file = os.path.join(
                            folder_config,
                            f"preset-{param_exomiser_preset}-analysis.json",
                        )

                    # If preset file exists
                    param_exomiser_analysis_default_config_file = full_path(
                        param_exomiser_analysis_default_config_file
                    )
                    if os.path.exists(param_exomiser_analysis_default_config_file):
                        # Load preset file into analysis dict (either yaml or json)
                        with open(
                            param_exomiser_analysis_default_config_file
                        ) as json_file:
                            # param_exomiser_analysis_dict[""] = json.load(json_file)
                            param_exomiser_analysis_dict["analysis"] = yaml.safe_load(
                                json_file
                            )

                    # Error preset file
                    else:
                        log.error(
                            f"No analysis preset config file ({param_exomiser_analysis_default_config_file})"
                        )
                        raise ValueError(
                            f"No analysis preset config file ({param_exomiser_analysis_default_config_file})"
                        )

                # If no analysis dict created
                if not param_exomiser_analysis_dict:
                    log.error(f"No analysis config")
                    raise ValueError(f"No analysis config")

                # Log
                log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}")

                ### PHENOPACKET ###
                ###################

                # If no PhenoPacket in analysis dict -> check in param
                if "phenopacket" not in param_exomiser_analysis_dict:

                    # If PhenoPacket in param -> load phenopacket json
                    if param_exomiser.get("phenopacket", None):

                        param_exomiser_phenopacket = param_exomiser.get("phenopacket")
                        param_exomiser_phenopacket = full_path(
                            param_exomiser_phenopacket
                        )

                        # If param phenopacket is a file and exists
                        if isinstance(
                            param_exomiser_phenopacket, str
                        ) and os.path.exists(param_exomiser_phenopacket):
                            # Load phenopacket file into analysis dict (either yaml or json)
                            with open(param_exomiser_phenopacket) as json_file:
                                param_exomiser_analysis_dict["phenopacket"] = (
                                    yaml.safe_load(json_file)
                                )

                        # If param phenopacket is a dict
                        elif isinstance(param_exomiser_phenopacket, dict):
                            # Use phenopacket dict as-is
                            param_exomiser_analysis_dict["phenopacket"] = (
                                param_exomiser_phenopacket
                            )

                        # Error phenopacket type
                        else:
                            log.error(f"Phenopacket type unknown. Check param file.")
                            raise ValueError(
                                f"Phenopacket type unknown. Check param file."
                            )

                # If no PhenoPacket in analysis dict -> construct from sample and HPO in param
                if "phenopacket" not in param_exomiser_analysis_dict:

                    # Init PhenoPacket
                    param_exomiser_analysis_dict["phenopacket"] = {
                        "id": "analysis",
                        "proband": {},
                    }

                    ### Add subject ###

                    # If subject exists
                    param_exomiser_subject = param_exomiser.get("subject", {})

                    # If subject does not exist -> find a sample ID
                    if not param_exomiser_subject:

                        # Found sample ID in param
                        sample = param_exomiser.get("sample", None)

                        # Find sample ID (first sample of the VCF header)
                        if not sample:
                            sample_list = self.get_header_sample_list()
                            if len(sample_list) > 0:
                                sample = sample_list[0]
                            else:
                                log.error(f"No sample found")
                                raise ValueError(f"No sample found")

                        # Create subject with unknown sex
                        param_exomiser_subject = {"id": sample, "sex": "UNKNOWN_SEX"}

                    # Add to dict
                    param_exomiser_analysis_dict["phenopacket"][
                        "subject"
                    ] = param_exomiser_subject

                    ### Add "phenotypicFeatures" ###

                    # If phenotypicFeatures exists
                    param_exomiser_phenotypicfeatures = param_exomiser.get(
                        "phenotypicFeatures", []
                    )

                    # If phenotypicFeatures does not exist -> try to infer from hpo list
                    if not param_exomiser_phenotypicfeatures:

                        # Found HPO in param
                        param_exomiser_hpo = param_exomiser.get("hpo", [])

                        # Split HPO if list in string format separated by comma
                        if isinstance(param_exomiser_hpo, str):
                            param_exomiser_hpo = param_exomiser_hpo.split(",")

                        # Create HPO list; keep only digits so "HP:0001159",
                        # "0001159" etc. all normalize to the numeric id
                        for hpo in param_exomiser_hpo:
                            hpo_clean = re.sub("[^0-9]", "", hpo)
                            param_exomiser_phenotypicfeatures.append(
                                {
                                    "type": {
                                        "id": f"HP:{hpo_clean}",
                                        "label": f"HP:{hpo_clean}",
                                    }
                                }
                            )

                    # Add to dict
                    param_exomiser_analysis_dict["phenopacket"][
                        "phenotypicFeatures"
                    ] = param_exomiser_phenotypicfeatures

                    # If no phenotypicFeatures -> remove hiPhivePrioritiser step
                    # (it requires phenotype input)
                    if not param_exomiser_phenotypicfeatures:
                        for step in param_exomiser_analysis_dict.get(
                            "analysis", {}
                        ).get("steps", []):
                            if "hiPhivePrioritiser" in step:
                                param_exomiser_analysis_dict.get("analysis", {}).get(
                                    "steps", []
                                ).remove(step)

                ### Add Input File ###

                # Initial file name and htsFiles
                tmp_vcf_name = os.path.join(tmp_dir, "initial.vcf.gz")
                param_exomiser_analysis_dict["phenopacket"]["htsFiles"] = [
                    {
                        "uri": tmp_vcf_name,
                        "htsFormat": "VCF",
                        "genomeAssembly": assembly,
                    }
                ]

                ### Add metaData ###

                # If metaData not in analysis dict, create an ISO-8601-like timestamp
                if "metaData" not in param_exomiser_analysis_dict:
                    param_exomiser_analysis_dict["phenopacket"]["metaData"] = {
                        "created": f"{datetime.datetime.now()}".replace(" ", "T") + "Z",
                        "createdBy": "howard",
                        "phenopacketSchemaVersion": 1,
                    }

                ### OutputOptions ###

                # Init output result folder
                output_results = os.path.join(tmp_dir, "results")

                # If no outputOptions in analysis dict
                if "outputOptions" not in param_exomiser_analysis_dict:

                    # default output formats
                    defaut_output_formats = ["TSV_VARIANT", "VCF"]

                    # Get outputOptions in param
                    output_options = param_exomiser.get("outputOptions", None)

                    # If no output_options in param -> use defaults
                    if not output_options:
                        output_options = {
                            "outputContributingVariantsOnly": False,
                            "numGenes": 0,
                            "outputFormats": defaut_output_formats,
                        }

                    # Force output directory/filename into the tmp results folder
                    output_options["outputDirectory"] = output_results
                    output_options["outputFileName"] = "howard"

                    # Add outputOptions in analysis dict
                    param_exomiser_analysis_dict["outputOptions"] = output_options

                else:

                    # Replace output directory, and make sure TSV_VARIANT and VCF
                    # formats are present (both are consumed below)
                    param_exomiser_analysis_dict["outputOptions"][
                        "outputDirectory"
                    ] = output_results
                    param_exomiser_analysis_dict["outputOptions"]["outputFormats"] = (
                        list(
                            set(
                                param_exomiser_analysis_dict.get(
                                    "outputOptions", {}
                                ).get("outputFormats", [])
                                + ["TSV_VARIANT", "VCF"]
                            )
                        )
                    )

                # log
                log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}")

                ### ANALYSIS FILE ###
                #####################

                ### Full JSON analysis config file ###

                exomiser_analysis = os.path.join(tmp_dir, "analysis.json")
                with open(exomiser_analysis, "w") as fp:
                    json.dump(param_exomiser_analysis_dict, fp, indent=4)

                ### SPLIT analysis and sample config files ###

                # Split analysis dict (shallow copy: nested dicts are shared
                # with param_exomiser_analysis_dict)
                param_exomiser_analysis_dict_for_split = (
                    param_exomiser_analysis_dict.copy()
                )

                # Phenopacket JSON file
                exomiser_analysis_phenopacket = os.path.join(
                    tmp_dir, "analysis_phenopacket.json"
                )
                with open(exomiser_analysis_phenopacket, "w") as fp:
                    json.dump(
                        param_exomiser_analysis_dict_for_split.get("phenopacket"),
                        fp,
                        indent=4,
                    )

                # Analysis JSON file without Phenopacket parameters
                param_exomiser_analysis_dict_for_split.pop("phenopacket")
                exomiser_analysis_analysis = os.path.join(
                    tmp_dir, "analysis_analysis.json"
                )
                with open(exomiser_analysis_analysis, "w") as fp:
                    json.dump(param_exomiser_analysis_dict_for_split, fp, indent=4)

                ### INITIAL VCF file ###
                ########################

                ### Create list of samples to use and include into initial VCF file ###

                # Subject (main sample)
                # Get sample ID in analysis dict
                sample_subject = (
                    param_exomiser_analysis_dict.get("phenopacket", {})
                    .get("subject", {})
                    .get("id", None)
                )
                sample_proband = (
                    param_exomiser_analysis_dict.get("phenopacket", {})
                    .get("proband", {})
                    .get("subject", {})
                    .get("id", None)
                )
                sample = []
                if sample_subject:
                    sample.append(sample_subject)
                if sample_proband:
                    sample.append(sample_proband)

                # Get sample IDs within pedigree
                pedigree_persons_list = (
                    param_exomiser_analysis_dict.get("phenopacket", {})
                    .get("pedigree", {})
                    .get("persons", {})
                )

                # Create list with all sample IDs in pedigree (if exists)
                pedigree_persons = []
                for person in pedigree_persons_list:
                    pedigree_persons.append(person.get("individualId"))

                # Concat subject sample ID and sample IDs in pedigree (dedup via set)
                samples = list(set(sample + pedigree_persons))

                # Check if sample list is not empty
                if not samples:
                    log.error(f"No samples found")
                    raise ValueError(f"No samples found")

                # Create VCF with samples (either sample in param or first one by default)
                # Export VCF file
                self.export_variant_vcf(
                    vcf_file=tmp_vcf_name,
                    remove_info=True,
                    add_samples=True,
                    list_samples=samples,
                    index=False,
                )

                ### Execute Exomiser ###
                ########################

                # Init command
                # NOTE(review): exomiser_command is never used below — dead assignment
                exomiser_command = ""

                # Command exomiser options: point spring config and data directory
                # at the per-assembly database folder
                exomiser_options = f" --spring.config.location={databases_folders}/{assembly}/application.properties --exomiser.data-directory={databases_folders}/{assembly} "

                # Release
                exomiser_release = param_exomiser.get("release", None)
                if exomiser_release:
                    # phenotype data version
                    exomiser_options += (
                        f" --exomiser.phenotype.data-version={exomiser_release} "
                    )
                    # data version
                    exomiser_options += (
                        f" --exomiser.{assembly}.data-version={exomiser_release} "
                    )
                    # variant white list (only if the file exists for this release/assembly)
                    variant_white_list_file = (
                        f"{exomiser_release}_{assembly}_clinvar_whitelist.tsv.gz"
                    )
                    if os.path.exists(
                        os.path.join(
                            databases_folders, assembly, variant_white_list_file
                        )
                    ):
                        exomiser_options += f" --exomiser.{assembly}.variant-white-list-path={variant_white_list_file} "

                # transcript_source
                transcript_source = param_exomiser.get(
                    "transcript_source", None
                )  # ucsc, refseq, ensembl
                if transcript_source:
                    exomiser_options += (
                        f" --exomiser.{assembly}.transcript-source={transcript_source} "
                    )

                # If analysis contains proband param -> split analysis/sample files
                if param_exomiser_analysis_dict.get("phenopacket", {}).get(
                    "proband", {}
                ):
                    exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis_analysis} --sample={exomiser_analysis_phenopacket} {exomiser_options} "

                # If no proband (usually unique sample) -> single analysis file
                else:
                    exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis} {exomiser_options}"

                # Log
                log.debug(f"exomiser_command_analysis={exomiser_command_analysis}")

                # Run command (non-zero exit code -> failure)
                result = subprocess.call(
                    exomiser_command_analysis.split(), stdout=subprocess.PIPE
                )
                if result:
                    log.error("Exomiser command failed")
                    raise ValueError("Exomiser command failed")

                ### RESULTS ###
                ###############

                ### Annotate with TSV fields ###

                # Option: explode TSV columns into INFO fields
                exomiser_to_info = param_exomiser.get("exomiser_to_info", False)

                # Init result tsv file
                output_results_tsv = os.path.join(output_results, "howard.variants.tsv")

                # Parse TSV file and explode columns in INFO field
                if exomiser_to_info and os.path.exists(output_results_tsv):

                    # Log
                    log.debug("Exomiser columns to VCF INFO field")

                    # Retrieve columns and types (LIMIT 0: header/schema only)
                    query = f""" SELECT * FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) LIMIT 0 """
                    output_results_tsv_df = self.get_query_to_df(query)
                    output_results_tsv_columns = output_results_tsv_df.columns.tolist()

                    # Init concat fields for update
                    sql_query_update_concat_fields = []

                    # Coordinate/genotype columns that must not become INFO fields
                    fields_to_avoid = [
                        "CONTIG",
                        "START",
                        "END",
                        "REF",
                        "ALT",
                        "QUAL",
                        "FILTER",
                        "GENOTYPE",
                    ]

                    # List all columns to add into header
                    for header_column in output_results_tsv_columns:

                        # If header column is enabled
                        if header_column not in fields_to_avoid:

                            # Header info type heuristic
                            # NOTE(review): reconstructed from a mangled dump — the
                            # else is read as pairing with the OUTER if (object
                            # dtype -> Float if fully numeric-coercible else
                            # String; non-object dtype -> Integer). Confirm
                            # against upstream source.
                            header_info_type = "String"
                            header_column_df = output_results_tsv_df[header_column]
                            header_column_df_dtype = header_column_df.dtype
                            if header_column_df_dtype == object:
                                if (
                                    pd.to_numeric(header_column_df, errors="coerce")
                                    .notnull()
                                    .all()
                                ):
                                    header_info_type = "Float"
                            else:
                                header_info_type = "Integer"

                            # Header info: sanitize field name ('-' -> '_', drop '#')
                            characters_to_validate = ["-"]
                            pattern = "[" + "".join(characters_to_validate) + "]"
                            header_info_name = re.sub(
                                pattern,
                                "_",
                                f"Exomiser_{header_column}".replace("#", ""),
                            )
                            header_info_number = "."
                            header_info_description = (
                                f"Exomiser {header_column} annotation"
                            )
                            header_info_source = "Exomiser"
                            header_info_version = "unknown"
                            header_info_code = CODE_TYPE_MAP[header_info_type]
                            vcf_reader.infos[header_info_name] = vcf.parser._Info(
                                header_info_name,
                                header_info_number,
                                header_info_type,
                                header_info_description,
                                header_info_source,
                                header_info_version,
                                header_info_code,
                            )

                            # Add field for update to concat fields
                            sql_query_update_concat_fields.append(
                                f"""
                                CASE
                                    WHEN table_parquet."{header_column}" NOT IN ('','.')
                                    THEN concat(
                                        '{header_info_name}=',
                                        table_parquet."{header_column}",
                                        ';'
                                    )
                                    ELSE ''
                                END
                                """
                            )

                    # Update query: append the TSV-derived fields to INFO, joining
                    # on chromosome (TSV CONTIG lacks the 'chr' prefix), POS/START,
                    # REF and ALT
                    sql_query_update = f"""
                        UPDATE {table_variants} as table_variants
                        SET INFO = concat(
                                    CASE
                                        WHEN INFO NOT IN ('', '.')
                                        THEN INFO
                                        ELSE ''
                                    END,
                                    CASE
                                        WHEN table_variants.INFO NOT IN ('','.')
                                        THEN ';'
                                        ELSE ''
                                    END,
                                    (
                                    SELECT
                                        concat(
                                            {",".join(sql_query_update_concat_fields)}
                                        )
                                    FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) as table_parquet
                                    WHERE concat('chr', CAST(table_parquet.\"CONTIG\" AS STRING)) = table_variants.\"#CHROM\"
                                        AND table_parquet.\"START\" = table_variants.\"POS\"
                                        AND table_parquet.\"ALT\" = table_variants.\"ALT\"
                                        AND table_parquet.\"REF\" = table_variants.\"REF\"
                                    )
                                )
                        ;
                        """

                    # Update
                    self.conn.execute(sql_query_update)

                ### Annotate with VCF INFO field ###

                # Init result VCF file
                output_results_vcf = os.path.join(output_results, "howard.vcf.gz")

                # If VCF exists
                if os.path.exists(output_results_vcf):

                    # Log
                    log.debug("Exomiser result VCF update variants")

                    # Find Exomiser INFO field annotation in header
                    with gzip.open(output_results_vcf, "rt") as f:
                        header_list = self.read_vcf_header(f)
                    exomiser_vcf_header = vcf.Reader(
                        io.StringIO("\n".join(header_list))
                    )

                    # Add annotation INFO field to header
                    vcf_reader.infos["Exomiser"] = exomiser_vcf_header.infos["Exomiser"]

                    # Update variants with VCF
                    self.update_from_vcf(output_results_vcf)

        return True
This function annotates with Exomiser.
This function uses args as parameters, in section "annotation" -> "exomiser", with sections:
- "analysis" (dict/file): Full analysis dictionary parameters (see Exomiser docs). Either a dict, or a file in JSON or YAML format. These parameters may change depending on other parameters (e.g. phenotypicFeatures/HPO). Default: None
- "preset" (string): Analysis preset (available in config folder). Used if no full "analysis" is provided. Default: "exome"
- "phenopacket" (dict/file): Samples and phenotypic features parameters (see Exomiser docs). Either a dict, or a file in JSON or YAML format. Default: None
- "subject" (dict): Sample parameters (see Exomiser docs). Example: "subject": { "id": "ISDBM322017", "sex": "FEMALE" } Default: None
- "sample" (string): Sample name to construct the "subject" section: "subject": { "id": "<sample>", "sex": "UNKNOWN_SEX" } Default: None
- "phenotypicFeatures" (dict): Phenotypic features to construct the "subject" section. Example: "phenotypicFeatures": [ { "type": { "id": "HP:0001159", "label": "Syndactyly" } }, { "type": { "id": "HP:0000486", "label": "Strabismus" } } ]
- "hpo" (list) List of HPO ids as phenotypic features. Example: "hpo": ['0001156', '0001363', '0011304', '0010055'] Default: []
- "outputOptions" (dict): Output options (see Exomiser docs). Default: "output_options" = { "outputContributingVariantsOnly": False, "numGenes": 0, "outputFormats": ["TSV_VARIANT", "VCF"] }
- "transcript_source" (string): Transcript source (either "refseq", "ucsc", "ensembl") Default: "refseq"
- "exomiser_to_info" (boolean): Add exomiser TSV file columns as INFO fields in VCF. Default: False
- "release" (string): Exomiser database release. If it does not exist, the database release will be downloaded (takes a while). Default: None (provided by application.properties configuration file)
- "exomiser_application_properties" (file): Exomiser configuration file (see Exomiser docs). Useful to automatically download databases (especially for specific genome databases).
Notes:
- If no sample in parameters, first sample in VCF will be chosen
- If no HPO found, the "hiPhivePrioritiser" analysis step will be switched off
Parameters
- threads: The number of threads to use
Returns
None.
    def annotation_snpeff(self, threads: int = None) -> None:
        """
        This function annotates variants with snpEff.

        Reads configuration from config section "folders" -> "databases" -> "snpeff"
        and parameters from param section "annotation" -> "snpeff" ("options",
        "stats", "csvStats"). Exports the variants to a temporary VCF, runs the
        snpEff jar, merges new INFO header fields, and updates the variants table
        from the annotated VCF.

        :param threads: The number of threads to use
        :return: None (returns early if the variants table is empty)
        """

        # DEBUG
        log.debug("Start annotation with snpeff databases")

        # Threads: fall back to the instance-level thread count
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # DEBUG: keep temporary files when verbosity is debug
        delete_tmp = True
        if self.get_config().get("verbosity", "warning") in ["debug"]:
            delete_tmp = False
        log.debug("Delete tmp files/folders: " + str(delete_tmp))

        # Config
        config = self.get_config()
        log.debug("Config: " + str(config))

        # Config - Folders - Databases
        databases_folders = (
            config.get("folders", {}).get("databases", {}).get("snpeff", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # # Config - Java
        # java_bin = get_bin(
        #     tool="java",
        #     bin="java",
        #     bin_type="bin",
        #     config=config,
        #     default_folder="/usr/bin",
        # )
        # if not (os.path.exists(java_bin) or (java_bin and which(java_bin))):
        #     log.error(f"Annotation failed: no java bin '{java_bin}'")
        #     raise ValueError(f"Annotation failed: no java bin '{java_bin}'")

        # # Config - snpEff bin
        # snpeff_jar = get_bin(
        #     tool="snpeff",
        #     bin="snpEff.jar",
        #     bin_type="jar",
        #     config=config,
        #     default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff",
        # )
        # if not (os.path.exists(snpeff_jar) or (snpeff_jar and which(snpeff_jar))):
        #     log.error(f"Annotation failed: no snpEff jar '{snpeff_jar}'")
        #     raise ValueError(f"Annotation failed: no snpEff jar '{snpeff_jar}'")

        # Config - snpEff bin command (java -jar invocation)
        snpeff_bin_command = get_bin_command(
            bin="snpEff.jar",
            tool="snpeff",
            bin_type="jar",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff",
        )
        if not snpeff_bin_command:
            msg_err = f"Annotation failed: no snpeff bin '{snpeff_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - snpEff databases folder (created if missing)
        snpeff_databases = (
            config.get("folders", {})
            .get("databases", {})
            .get("snpeff", DEFAULT_SNPEFF_FOLDER)
        )
        snpeff_databases = full_path(snpeff_databases)
        if snpeff_databases is not None and snpeff_databases != "":
            log.debug(f"Create snpEff databases folder")
            if not os.path.exists(snpeff_databases):
                os.makedirs(snpeff_databases)

        # Param
        param = self.get_param()
        log.debug("Param: " + str(param))

        # Param - options (logged; snpeff_options below re-reads the same key)
        options = param.get("annotation", {}).get("snpeff", {}).get("options", None)
        log.debug("Options: " + str(options))

        # Param - Assembly (param overrides config, config overrides default)
        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))

        # Param - Options: stats/csvStats paths support an "OUTPUT" placeholder
        # replaced by the output file path
        snpeff_options = (
            param.get("annotation", {}).get("snpeff", {}).get("options", "")
        )
        snpeff_stats = param.get("annotation", {}).get("snpeff", {}).get("stats", None)
        snpeff_csvstats = (
            param.get("annotation", {}).get("snpeff", {}).get("csvStats", None)
        )
        if snpeff_stats:
            snpeff_stats = snpeff_stats.replace("OUTPUT", self.get_output())
            snpeff_stats = full_path(snpeff_stats)
            snpeff_options += f" -stats {snpeff_stats}"
        if snpeff_csvstats:
            snpeff_csvstats = snpeff_csvstats.replace("OUTPUT", self.get_output())
            snpeff_csvstats = full_path(snpeff_csvstats)
            snpeff_options += f" -csvStats {snpeff_csvstats}"

        # Data
        table_variants = self.get_table_variants()

        # Check if not empty: nothing to annotate on an empty variants table
        log.debug("Check if not empty")
        sql_query_chromosomes = (
            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
        )
        # if not self.conn.execute(f"{sql_query_chromosomes}").df()["count"][0]:
        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
            log.info(f"VCF empty")
            return

        # Export in VCF: temporary input file for snpEff
        log.debug("Create initial file to annotate")
        tmp_vcf = NamedTemporaryFile(
            prefix=self.get_prefix(),
            dir=self.get_tmp_dir(),
            suffix=".vcf.gz",
            delete=True,
        )
        tmp_vcf_name = tmp_vcf.name

        # VCF header
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Existing annotations (debug-logged only)
        for vcf_annotation in self.get_header().infos:

            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
            log.debug(
                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
            )

        # Memory limit
        # if config.get("memory", None):
        #     memory_limit = config.get("memory", "8G")
        # else:
        #     memory_limit = "8G"
        memory_limit = self.get_memory("8G")
        log.debug(f"memory_limit: {memory_limit}")

        # snpEff java options
        # NOTE(review): built but not visibly used in the command below — confirm
        snpeff_java_options = (
            f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} "
        )
        log.debug(f"Exomiser java options: {snpeff_java_options}")

        # Hard-coded: always re-annotate even if "ANN" already exists in header
        force_update_annotation = True

        if "ANN" not in self.get_header().infos or force_update_annotation:

            # Check snpEff database (downloads if missing)
            log.debug(f"Check snpEff databases {[assembly]}")
            databases_download_snpeff(
                folder=snpeff_databases, assemblies=[assembly], config=config
            )

            # Export VCF file
            self.export_variant_vcf(
                vcf_file=tmp_vcf_name,
                remove_info=True,
                add_samples=False,
                index=True,
            )

            # Tmp files for snpEff output and stderr
            err_files = []
            tmp_annotate_vcf = NamedTemporaryFile(
                prefix=self.get_prefix(),
                dir=self.get_tmp_dir(),
                suffix=".vcf",
                delete=False,
            )
            tmp_annotate_vcf_name = tmp_annotate_vcf.name
            tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
            err_files.append(tmp_annotate_vcf_name_err)

            # Command: snpEff writes annotated VCF to stdout, messages to stderr
            snpeff_command = f"{snpeff_bin_command} {assembly} -dataDir {snpeff_databases} {snpeff_options} {tmp_vcf_name} 1>{tmp_annotate_vcf_name} 2>>{tmp_annotate_vcf_name_err}"
            log.debug(f"Annotation - snpEff command: {snpeff_command}")
            run_parallel_commands([snpeff_command], 1)

            # Error messages: scan stderr files for htslib-style [W::]/[E::] tags
            log.info(f"Error/Warning messages:")
            error_message_command_all = []
            error_message_command_warning = []
            error_message_command_err = []
            for err_file in err_files:
                with open(err_file, "r") as f:
                    for line in f:
                        message = line.strip()
                        error_message_command_all.append(message)
                        if line.startswith("[W::"):
                            error_message_command_warning.append(message)
                        if line.startswith("[E::"):
                            error_message_command_err.append(f"{err_file}: " + message)
            # log info
            for message in list(
                set(error_message_command_err + error_message_command_warning)
            ):
                log.info(f"   {message}")
            # debug info
            for message in list(set(error_message_command_all)):
                log.debug(f"   {message}")
            # failed: any [E::] line aborts the annotation
            if len(error_message_command_err):
                log.error("Annotation failed: Error in commands")
                raise ValueError("Annotation failed: Error in commands")

            # Find annotation in header of the snpEff output and merge the new
            # INFO fields into the current header
            with open(tmp_annotate_vcf_name, "rt") as f:
                header_list = self.read_vcf_header(f)
            annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list)))

            for ann in annovar_vcf_header.infos:
                if ann not in self.get_header().infos:
                    vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann)

            # Update variants
            log.info(f"Annotation - Updating...")
            self.update_from_vcf(tmp_annotate_vcf_name)

        else:
            # Unreachable while force_update_annotation is hard-coded True above
            if "ANN" in self.get_header().infos:
                log.debug(f"Existing snpEff annotations in VCF")
            if force_update_annotation:
                log.debug(f"Existing snpEff annotations in VCF - annotation forced")
This function annotates the variants with snpEff.
Parameters
- threads: the number of threads to use (defaults to the instance's configured thread count)
Returns
None; the variants table and the VCF header are updated in place.
    def annotation_annovar(self, threads: int = None) -> None:
        """
        Annotate the variants table with Annovar databases.

        Exports the current variants to a temporary VCF, runs `table_annovar.pl`
        for each configured annotation database (piping the result through
        bcftools/sed/awk to clean and rename INFO fields), merges the
        per-database annotated VCFs with `bcftools merge`, and updates the
        variants table and VCF header from the merged result. Temporary files
        are removed at the end.

        :param threads: number of threads to use (defaults to `self.get_threads()`)
        :return: None; the variants table and header are updated in place
        """

        # DEBUG
        log.debug("Start annotation with Annovar databases")

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Tmp and Err files (collected for cleanup / error reporting)
        tmp_files = []
        err_files = []

        # Keep tmp files/folders when verbosity is debug
        # NOTE(review): delete_tmp is only logged here; the cleanup at the end
        # of the method runs unconditionally (`if True:`) — confirm intent
        delete_tmp = True
        if self.get_config().get("verbosity", "warning") in ["debug"]:
            delete_tmp = False
            log.debug("Delete tmp files/folders: " + str(delete_tmp))

        # Config
        config = self.get_config()
        log.debug("Config: " + str(config))

        # Config - Folders - Databases
        databases_folders = (
            config.get("folders", {}).get("databases", {}).get("annovar", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # Config - annovar bin command (table_annovar.pl invocation)
        annovar_bin_command = get_bin_command(
            bin="table_annovar.pl",
            tool="annovar",
            bin_type="perl",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/annovar",
        )
        if not annovar_bin_command:
            msg_err = f"Annotation failed: no annovar bin '{annovar_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - BCFTools bin command
        bcftools_bin_command = get_bin_command(
            bin="bcftools",
            tool="bcftools",
            bin_type="bin",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
        )
        if not bcftools_bin_command:
            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - annovar databases folder (created if missing)
        annovar_databases = (
            config.get("folders", {})
            .get("databases", {})
            .get("annovar", DEFAULT_ANNOVAR_FOLDER)
        )
        annovar_databases = full_path(annovar_databases)
        if annovar_databases != "" and not os.path.exists(annovar_databases):
            os.makedirs(annovar_databases)

        # Param
        param = self.get_param()
        log.debug("Param: " + str(param))

        # Param - options
        options = param.get("annotation", {}).get("annovar", {}).get("options", {})
        log.debug("Options: " + str(options))

        # Param - annotations (database name -> fields mapping)
        annotations = (
            param.get("annotation", {}).get("annovar", {}).get("annotations", {})
        )
        log.debug("Annotations: " + str(annotations))

        # Param - Assembly
        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))

        # Annovar database assembly subfolder (created if missing)
        annovar_databases_assembly = f"{annovar_databases}/{assembly}"
        if annovar_databases_assembly != "" and not os.path.exists(
            annovar_databases_assembly
        ):
            os.makedirs(annovar_databases_assembly)

        # Data
        table_variants = self.get_table_variants()

        # Nothing to do on an empty variants table
        log.debug("Check if not empty")
        sql_query_chromosomes = (
            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
        )
        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
        if not sql_query_chromosomes_df["count"][0]:
            log.info(f"VCF empty")
            return

        # VCF header
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Log existing annotations
        for vcf_annotation in self.get_header().infos:

            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
            log.debug(
                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
            )

        # Annotation is always (re)computed
        force_update_annotation = True

        if annotations:

            # NOTE(review): `commands` is never used in this method
            commands = []
            tmp_annotates_vcf_name_list = []

            # Export in VCF
            log.debug("Create initial file to annotate")
            tmp_vcf = NamedTemporaryFile(
                prefix=self.get_prefix(),
                dir=self.get_tmp_dir(),
                suffix=".vcf.gz",
                delete=False,
            )
            tmp_vcf_name = tmp_vcf.name
            tmp_files.append(tmp_vcf_name)
            tmp_files.append(tmp_vcf_name + ".tbi")

            # Export VCF file (bgzipped and indexed)
            self.export_variant_vcf(
                vcf_file=tmp_vcf_name,
                remove_info=".",
                add_samples=False,
                index=True,
            )

            # Create file for field rename (consumed by bcftools --rename-annots)
            log.debug("Create file for field rename")
            tmp_rename = NamedTemporaryFile(
                prefix=self.get_prefix(),
                dir=self.get_tmp_dir(),
                suffix=".rename",
                delete=False,
            )
            tmp_rename_name = tmp_rename.name
            tmp_files.append(tmp_rename_name)

            # Check/download Annovar databases for the assembly
            log.debug(
                f"Check Annovar databases {[assembly]}: {list(annotations.keys())}"
            )
            databases_download_annovar(
                folder=annovar_databases,
                files=list(annotations.keys()),
                assemblies=[assembly],
            )

            # One table_annovar.pl run (plus cleanup pipe) per database
            for annotation in annotations:
                annotation_fields = annotations[annotation]

                if not annotation_fields:
                    annotation_fields = {"INFO": None}

                log.info(f"Annotations Annovar - database '{annotation}'")
                log.debug(f"Annotation '{annotation}' - fields: {annotation_fields}")

                # Tmp file for annovar
                # NOTE: err_files is reset per database, so the error check
                # below only inspects this database's err file
                err_files = []
                tmp_annotate_vcf_directory = TemporaryDirectory(
                    prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".annovar"
                )
                tmp_annotate_vcf_prefix = tmp_annotate_vcf_directory.name + "/annovar"
                tmp_annotate_vcf_name_annovar = (
                    tmp_annotate_vcf_prefix + "." + assembly + "_multianno.vcf"
                )
                tmp_annotate_vcf_name_err = tmp_annotate_vcf_directory.name + "/.err"
                err_files.append(tmp_annotate_vcf_name_err)
                tmp_files.append(tmp_annotate_vcf_name_err)

                # Tmp file final vcf annotated by annovar
                tmp_annotate_vcf = NamedTemporaryFile(
                    prefix=self.get_prefix(),
                    dir=self.get_tmp_dir(),
                    suffix=".vcf.gz",
                    delete=False,
                )
                tmp_annotate_vcf_name = tmp_annotate_vcf.name
                tmp_annotates_vcf_name_list.append(tmp_annotate_vcf_name)
                tmp_files.append(tmp_annotate_vcf_name)
                tmp_files.append(tmp_annotate_vcf_name + ".tbi")

                # Fields to annotate / their renamed targets
                annotation_list = []
                annotation_renamed_list = []

                for annotation_field in annotation_fields:

                    # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
                    annotation_fields_new_name = annotation_fields.get(
                        annotation_field, annotation_field
                    )
                    if not annotation_fields_new_name:
                        annotation_fields_new_name = annotation_field

                    if (
                        force_update_annotation
                        or annotation_fields_new_name not in self.get_header().infos
                    ):
                        annotation_list.append(annotation_field)
                        annotation_renamed_list.append(annotation_fields_new_name)
                    else:  # annotation_fields_new_name in self.get_header().infos and not force_update_annotation:
                        log.warning(
                            f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)"
                        )

                    # Append one "old new" rename pair to the rename file
                    run_parallel_commands(
                        [
                            f"echo 'INFO/{annotation_field} {annotation_fields_new_name}' >> {tmp_rename_name}"
                        ],
                        1,
                    )

                # log.debug("fields_to_removed: " + str(fields_to_removed))
                log.debug("annotation_list: " + str(annotation_list))

                # protocol (the Annovar database name)
                protocol = annotation

                # argument
                argument = ""

                # operation: "f" filter-based by default, "g" gene-based for
                # refGene/ensGene databases, "r" region-based for cytoBand
                operation = "f"
                if annotation in ["refGene", "refGeneWithVer"] or annotation.startswith(
                    "ensGene"
                ):
                    operation = "g"
                    if options.get("genebase", None):
                        argument = f"""'{options.get("genebase","")}'"""
                elif annotation in ["cytoBand"]:
                    operation = "r"

                # argument option
                argument_option = ""
                if argument != "":
                    argument_option = " --argument " + argument

                # command options (extra options forwarded as --key=value)
                command_options = f""" --nastring . --vcfinput --polish --dot2underline --thread {threads} """  # --intronhgvs 10
                for option in options:
                    if option not in ["genebase"]:
                        command_options += f""" --{option}={options[option]}"""

                # Command

                # Command - Annovar
                command_annovar = f"""{annovar_bin_command} {tmp_vcf_name} {annovar_databases_assembly} --buildver {assembly} --outfile {tmp_annotate_vcf_prefix} --remove --protocol {protocol} --operation {operation} {argument_option} {command_options} 2>>{tmp_annotate_vcf_name_err} && mv {tmp_annotate_vcf_name_annovar} {tmp_annotate_vcf_name}.tmp.vcf """
                tmp_files.append(f"{tmp_annotate_vcf_name}.tmp.vcf")

                # Command - start pipe
                command_annovar += f""" && {bcftools_bin_command} view --threads={threads} {tmp_annotate_vcf_name}.tmp.vcf 2>>{tmp_annotate_vcf_name_err} """

                # Command - Clean INFO/ANNOVAR_DATE (due to Annovar issue with multiple TAGS!)
                command_annovar += """ | sed "s/ANNOVAR_DATE=[^;\t]*;//gi" """

                # Command - Special characters (refGene annotation)
                command_annovar += """ | sed "s/\\\\\\x3b/,/gi" """

                # Command - Clean empty fields (with value "."): the awk pass
                # rebuilds the INFO column ($8) keeping only key=value pairs
                # whose value is not "."
                command_annovar += """ | awk -F'\\t' -v OFS='\\t' '{if ($0 ~ /^#/) print; else {split($8,a,";");for(i=1;i<=length(a);i++) {split(a[i],b,"=");if(b[2]!=".") {c[b[1]]=b[2]}}; split($8,d,";");for(i=1;i<=length(d);i++) {split(d[i],e,"=");if(c[e[1]]!="") {if(info!="") {info=info";"}; info=info""e[1]"="c[e[1]]}}; if(info!="") {$8=info} else {$8=""}; delete c; info=""; print}}' """

                # Command - Extract only needed fields, and remove ANNOVAR fields, and compress and index final file
                annovar_fields_to_keep = ["INFO/ANNOVAR_DATE", "INFO/ALLELE_END"]
                if "ALL" not in annotation_list and "INFO" not in annotation_list:
                    # for ann in annotation_renamed_list:
                    for ann in annotation_list:
                        annovar_fields_to_keep.append(f"^INFO/{ann}")

                command_annovar += f""" | {bcftools_bin_command} annotate --pair-logic exact --threads={threads} -x {",".join(annovar_fields_to_keep)} --rename-annots={tmp_rename_name} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} """

                # Command - indexing
                command_annovar += f""" && tabix {tmp_annotate_vcf_name} """

                log.debug(f"Annotation - Annovar command: {command_annovar}")
                run_parallel_commands([command_annovar], 1)

                # Error messages collected from this database's err file
                log.info(f"Error/Warning messages:")
                error_message_command_all = []
                error_message_command_warning = []
                error_message_command_err = []
                for err_file in err_files:
                    with open(err_file, "r") as f:
                        for line in f:
                            message = line.strip()
                            error_message_command_all.append(message)
                            if line.startswith("[W::") or line.startswith("WARNING"):
                                error_message_command_warning.append(message)
                            if line.startswith("[E::") or line.startswith("ERROR"):
                                error_message_command_err.append(
                                    f"{err_file}: " + message
                                )
                # log info
                for message in list(
                    set(error_message_command_err + error_message_command_warning)
                ):
                    log.info(f"   {message}")
                # debug info
                for message in list(set(error_message_command_all)):
                    log.debug(f"   {message}")
                # failed
                if len(error_message_command_err):
                    log.error("Annotation failed: Error in commands")
                    raise ValueError("Annotation failed: Error in commands")

            if tmp_annotates_vcf_name_list:

                # List of annotated files
                tmp_annotates_vcf_name_to_merge = " ".join(tmp_annotates_vcf_name_list)

                # Tmp file for the merged result
                tmp_annotate_vcf = NamedTemporaryFile(
                    prefix=self.get_prefix(),
                    dir=self.get_tmp_dir(),
                    suffix=".vcf.gz",
                    delete=False,
                )
                tmp_annotate_vcf_name = tmp_annotate_vcf.name
                tmp_files.append(tmp_annotate_vcf_name)
                tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
                err_files.append(tmp_annotate_vcf_name_err)
                tmp_files.append(tmp_annotate_vcf_name_err)

                # Command merge (original exported VCF + all annotated VCFs)
                merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_annotates_vcf_name_to_merge} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} "
                log.info(
                    f"Annotation Annovar - Annotation merging "
                    + str(len(tmp_annotates_vcf_name_list))
                    + " annotated files"
                )
                log.debug(f"Annotation - merge command: {merge_command}")
                run_parallel_commands([merge_command], 1)

                # Merge new INFO fields from the merged VCF header
                with bgzf.open(tmp_annotate_vcf_name, "rt") as f:
                    header_list = self.read_vcf_header(f)
                annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list)))

                for ann in annovar_vcf_header.infos:
                    if ann not in self.get_header().infos:
                        vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann)

                # Update variants
                log.info(f"Annotation Annovar - Updating...")
                self.update_from_vcf(tmp_annotate_vcf_name)

        # Clean files
        # Tmp file remove command
        if True:
            tmp_files_remove_command = ""
            if tmp_files:
                tmp_files_remove_command = " ".join(tmp_files)
            clean_command = f" rm -f {tmp_files_remove_command} "
            log.debug(f"Annotation Annovar - Annotation cleaning ")
            log.debug(f"Annotation - cleaning command: {clean_command}")
            run_parallel_commands([clean_command], 1)
It takes a VCF file, annotates it with Annovar, and then updates the database with the new annotations.
Parameters
- threads: number of threads to use (defaults to the instance's configured thread count)
Returns
None; the variants table and the VCF header are updated in place.
    def annotation_parquet(self, threads: int = None) -> None:
        """
        Annotate the variants table with parquet-format annotation databases.

        For each configured annotation database, loads its header, maps the
        requested fields to database columns, builds per-chromosome SQL UPDATE
        queries that concatenate the new annotations into the INFO column, and
        executes them through the DuckDB connection. The in-memory VCF header
        is extended with the added INFO fields.

        :param threads: number of threads to use (defaults to
            `self.get_threads()`); NOTE(review): only logged, not used further
            in this method
        :return: None; the variants table and header are updated in place
        """

        # DEBUG
        log.debug("Start annotation with parquet databases")

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Keep tmp files/folders when verbosity is debug
        delete_tmp = True
        if self.get_config().get("verbosity", "warning") in ["debug"]:
            delete_tmp = False
            log.debug("Delete tmp files/folders: " + str(delete_tmp))

        # Config - union of "annotations" and "parquet" database folders
        databases_folders = set(
            self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("annotations", ["."])
            + self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("parquet", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # Param - annotations (database -> fields mapping)
        annotations = (
            self.get_param()
            .get("annotation", {})
            .get("parquet", {})
            .get("annotations", None)
        )
        log.debug("Annotations: " + str(annotations))

        # Assembly
        assembly = self.get_param().get(
            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
        )

        # Force Update / Append Annotation options
        force_update_annotation = (
            self.get_param()
            .get("annotation", {})
            .get("options", {})
            .get("annotations_update", False)
        )
        log.debug(f"force_update_annotation={force_update_annotation}")
        force_append_annotation = (
            self.get_param()
            .get("annotation", {})
            .get("options", {})
            .get("annotations_append", False)
        )
        log.debug(f"force_append_annotation={force_append_annotation}")

        # Data
        table_variants = self.get_table_variants()

        # Nothing to do on an empty variants table
        log.debug("Check if not empty")
        sql_query_chromosomes_df = self.get_query_to_df(
            f"""SELECT count(*) as count FROM {table_variants} as table_variants LIMIT 1"""
        )
        if not sql_query_chromosomes_df["count"][0]:
            log.info(f"VCF empty")
            return

        # VCF header
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Total number of variants (used for the final summary log)
        log.debug("NB Variants Start")
        nb_variants = self.conn.execute(
            f"SELECT count(*) AS count FROM variants"
        ).fetchdf()["count"][0]
        log.debug("NB Variants Stop")

        # Log existing annotations
        for vcf_annotation in self.get_header().infos:

            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
            log.debug(
                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
            )

        # Added columns
        # NOTE(review): never appended to in this method, so the drop loop at
        # the end is currently a no-op — confirm whether this is intentional
        added_columns = []

        # drop indexes
        log.debug(f"Drop indexes...")
        self.drop_indexes()

        if annotations:

            # "ALL" pseudo-annotation: scan available databases and add each
            # one (with all its INFO fields) to the annotations dict
            if "ALL" in annotations:

                all_param = annotations.get("ALL", {})
                all_param_formats = all_param.get("formats", None)
                all_param_releases = all_param.get("releases", None)

                databases_infos_dict = self.scan_databases(
                    database_formats=all_param_formats,
                    database_releases=all_param_releases,
                )
                for database_infos in databases_infos_dict.keys():
                    if database_infos not in annotations:
                        annotations[database_infos] = {"INFO": None}

            for annotation in annotations:

                # "ALL" itself is a directive, not a database
                if annotation in ["ALL"]:
                    continue

                # Annotation Name
                annotation_name = os.path.basename(annotation)

                # Annotation fields
                annotation_fields = annotations[annotation]
                if not annotation_fields:
                    annotation_fields = {"INFO": None}

                log.debug(f"Annotation '{annotation_name}'")
                log.debug(
                    f"Annotation '{annotation_name}' - fields: {annotation_fields}"
                )

                # Create Database
                database = Database(
                    database=annotation,
                    databases_folders=databases_folders,
                    assembly=assembly,
                )

                # Find files
                parquet_file = database.get_database()
                parquet_hdr_file = database.get_header_file()
                parquet_type = database.get_type()

                # Check if files exists
                if not parquet_file or not parquet_hdr_file:
                    log.error("Annotation failed: file not found")
                    raise ValueError("Annotation failed: file not found")
                else:
                    # Get parquet connexion (attach database if needed)
                    parquet_sql_attach = database.get_sql_database_attach(
                        output="query"
                    )
                    if parquet_sql_attach:
                        self.conn.execute(parquet_sql_attach)
                    parquet_file_link = database.get_sql_database_link()
                    # Log
                    log.debug(
                        f"Annotation '{annotation_name}' - file: "
                        + str(parquet_file)
                        + " and "
                        + str(parquet_hdr_file)
                    )

                    # Database full header columns
                    parquet_hdr_vcf_header_columns = database.get_header_file_columns(
                        parquet_hdr_file
                    )
                    # Log
                    log.debug(
                        "Annotation database header columns : "
                        + str(parquet_hdr_vcf_header_columns)
                    )

                    # Load header as VCF object
                    parquet_hdr_vcf_header_infos = database.get_header().infos
                    # Log
                    log.debug(
                        "Annotation database header: "
                        + str(parquet_hdr_vcf_header_infos)
                    )

                    # Get extra infos (columns beyond the standard VCF ones)
                    parquet_columns = database.get_extra_columns()
                    # Log
                    log.debug("Annotation database Columns: " + str(parquet_columns))

                    # Add extra columns if "ALL" in annotation_fields
                    # if "ALL" in annotation_fields:
                    #     allow_add_extra_column = True
                    if "ALL" in annotation_fields and database.get_extra_columns():
                        for extra_column in database.get_extra_columns():
                            if (
                                extra_column not in annotation_fields
                                and extra_column.replace("INFO/", "")
                                not in parquet_hdr_vcf_header_infos
                            ):
                                # Register the extra column with a generic
                                # String INFO definition
                                parquet_hdr_vcf_header_infos[extra_column] = (
                                    vcf.parser._Info(
                                        extra_column,
                                        ".",
                                        "String",
                                        f"{extra_column} description",
                                        "unknown",
                                        "unknown",
                                        self.code_type_map["String"],
                                    )
                                )

                    # For all fields in database
                    annotation_fields_all = False
                    if "ALL" in annotation_fields or "INFO" in annotation_fields:
                        annotation_fields_all = True
                        annotation_fields = {
                            key: key for key in parquet_hdr_vcf_header_infos
                        }

                        log.debug(
                            "Annotation database header - All annotations added: "
                            + str(annotation_fields)
                        )

                    # Init

                    # List of annotation fields to use
                    sql_query_annotation_update_info_sets = []

                    # List of annotation to agregate (regions databases)
                    sql_query_annotation_to_agregate = []

                    # Number of fields
                    nb_annotation_field = 0

                    # Annotation fields processed
                    annotation_fields_processed = []

                    # Columns mapping (requested field -> database column)
                    map_columns = database.map_columns(
                        columns=annotation_fields, prefixes=["INFO/"]
                    )

                    # Query dict for fields to remove (update option)
                    query_dict_remove = {}

                    # Fetch annotation fields
                    for annotation_field in annotation_fields:

                        # annotation_field_column
                        annotation_field_column = map_columns.get(
                            annotation_field, "INFO"
                        )

                        # field new name, if parametered
                        annotation_fields_new_name = annotation_fields.get(
                            annotation_field, annotation_field
                        )
                        if not annotation_fields_new_name:
                            annotation_fields_new_name = annotation_field

                        # To annotate
                        # force_update_annotation = True
                        # force_append_annotation = True
                        # if annotation_field in parquet_hdr_vcf_header_infos and (force_update_annotation or (annotation_fields_new_name not in self.get_header().infos)):
                        if annotation_field in parquet_hdr_vcf_header_infos and (
                            force_update_annotation
                            or force_append_annotation
                            or (
                                annotation_fields_new_name
                                not in self.get_header().infos
                            )
                        ):

                            # Add field to annotation to process list
                            annotation_fields_processed.append(
                                annotation_fields_new_name
                            )

                            # explode infos for the field
                            annotation_fields_new_name_info_msg = ""
                            if (
                                force_update_annotation
                                and annotation_fields_new_name
                                in self.get_header().infos
                            ):
                                # Remove field from INFO (the update option
                                # strips the existing value before re-annotating)
                                query = f"""
                                    UPDATE {table_variants} as table_variants
                                    SET INFO = REGEXP_REPLACE(
                                                concat(table_variants.INFO,''),
                                                ';*{annotation_fields_new_name}=[^;]*',
                                                ''
                                                )
                                    WHERE concat(';',table_variants.INFO) LIKE '%;{annotation_fields_new_name}=%'
                                    """
                                annotation_fields_new_name_info_msg = " [update]"
                                query_dict_remove[
                                    f"remove 'INFO/{annotation_fields_new_name}'"
                                ] = query

                            # Sep between fields in INFO
                            nb_annotation_field += 1
                            if nb_annotation_field > 1:
                                annotation_field_sep = ";"
                            else:
                                annotation_field_sep = ""

                            log.info(
                                f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'{annotation_fields_new_name_info_msg}"
                            )

                            # Add INFO field to header (fill missing metadata
                            # with defaults)
                            parquet_hdr_vcf_header_infos_number = (
                                parquet_hdr_vcf_header_infos[annotation_field].num
                                or "."
                            )
                            parquet_hdr_vcf_header_infos_type = (
                                parquet_hdr_vcf_header_infos[annotation_field].type
                                or "String"
                            )
                            parquet_hdr_vcf_header_infos_description = (
                                parquet_hdr_vcf_header_infos[annotation_field].desc
                                or f"{annotation_field} description"
                            )
                            parquet_hdr_vcf_header_infos_source = (
                                parquet_hdr_vcf_header_infos[annotation_field].source
                                or "unknown"
                            )
                            parquet_hdr_vcf_header_infos_version = (
                                parquet_hdr_vcf_header_infos[annotation_field].version
                                or "unknown"
                            )

                            vcf_reader.infos[annotation_fields_new_name] = (
                                vcf.parser._Info(
                                    annotation_fields_new_name,
                                    parquet_hdr_vcf_header_infos_number,
                                    parquet_hdr_vcf_header_infos_type,
                                    parquet_hdr_vcf_header_infos_description,
                                    parquet_hdr_vcf_header_infos_source,
                                    parquet_hdr_vcf_header_infos_version,
                                    self.code_type_map[
                                        parquet_hdr_vcf_header_infos_type
                                    ],
                                )
                            )

                            # Append: only annotate variants whose existing
                            # value is empty or "."
                            if force_append_annotation:
                                query_case_when_append = f""" AND REGEXP_EXTRACT(concat(';', table_variants.INFO), ';{annotation_fields_new_name}=([^;]*)',1) IN ('','.') """
                            else:
                                query_case_when_append = ""

                            # Annotation/Update query fields
                            # Found in INFO column
                            if (
                                annotation_field_column == "INFO"
                                and "INFO" in parquet_hdr_vcf_header_columns
                            ):
                                sql_query_annotation_update_info_sets.append(
                                    f"""
                                    CASE WHEN REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1) NOT IN ('','.') {query_case_when_append}
                                        THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1))
                                        ELSE ''
                                    END
                                    """
                                )
                            # Found in a specific column
                            else:
                                # sql_query_annotation_update_info_sets.append(
                                #     f"""
                                #     CASE WHEN table_parquet."{annotation_field_column}" NOT IN ('','.') {query_case_when_append}
                                #         THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', replace(table_parquet."{annotation_field_column}", ';', ','))
                                #         ELSE ''
                                #     END
                                #     """
                                # )
                                sql_query_annotation_update_info_sets.append(
                                    f"""
                                    CASE WHEN table_parquet."{annotation_field_column}" NOT IN ('','.') {query_case_when_append}
                                        THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', replace(CAST(table_parquet."{annotation_field_column}" AS VARCHAR), ';', ','))
                                        ELSE ''
                                    END
                                    """
                                )
                                sql_query_annotation_to_agregate.append(
                                    f""" string_agg(DISTINCT table_parquet_from."{annotation_field_column}", ',') AS "{annotation_field_column}" """
                                )

                        # Not to annotate
                        else:

                            if force_update_annotation:
                                annotation_message = "forced"
                            else:
                                annotation_message = "skipped"

                            if annotation_field not in parquet_hdr_vcf_header_infos:
                                log.warning(
                                    f"Annotation '{annotation_name}' - '{annotation_field}' [{nb_annotation_field}] - not available in parquet file"
                                )
                            if annotation_fields_new_name in self.get_header().infos:
                                log.warning(
                                    f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' [{nb_annotation_field}] - already exists in header ({annotation_message})"
                                )

                    # Check if ALL fields have to be annotated. Thus concat all INFO field
                    # allow_annotation_full_info = True
                    allow_annotation_full_info = not force_append_annotation

                    if parquet_type in ["regions"]:
                        allow_annotation_full_info = False

                    if (
                        allow_annotation_full_info
                        and nb_annotation_field == len(annotation_fields)
                        and annotation_fields_all
                        and (
                            "INFO" in parquet_hdr_vcf_header_columns
                            and "INFO" in database.get_extra_columns()
                        )
                    ):
                        # Shortcut: copy the database INFO column wholesale
                        log.debug("Column INFO annotation enabled")
                        sql_query_annotation_update_info_sets = []
                        sql_query_annotation_update_info_sets.append(
                            f" table_parquet.INFO "
                        )

                    if sql_query_annotation_update_info_sets:

                        # Annotate
                        log.info(f"Annotation '{annotation_name}' - Annotation...")

                        # Join query annotation update info sets for SQL
                        sql_query_annotation_update_info_sets_sql = ",".join(
                            sql_query_annotation_update_info_sets
                        )

                        # Check chromosomes list (and variants infos)
                        sql_query_chromosomes = f"""
                            SELECT table_variants."#CHROM" as CHROM, count(*) AS count_variants, min(POS) AS min_variants, MAX(POS) AS max_variants
                            FROM {table_variants} as table_variants
                            GROUP BY table_variants."#CHROM"
                            ORDER BY table_variants."#CHROM"
                            """
                        sql_query_chromosomes_df = self.conn.execute(
                            sql_query_chromosomes
                        ).df()
                        sql_query_chromosomes_dict = {
                            entry["CHROM"]: {
                                "count": entry["count_variants"],
                                "min": entry["min_variants"],
                                "max": entry["max_variants"],
                            }
                            for index, entry in sql_query_chromosomes_df.iterrows()
                        }

                        # Init (removal queries run first, before annotation)
                        nb_of_query = 0
                        nb_of_variant_annotated = 0
                        query_dict = query_dict_remove

                        # for chrom in sql_query_chromosomes_df["CHROM"]:
                        for chrom in sql_query_chromosomes_dict:

                            # Number of variant by chromosome
                            nb_of_variant_by_chrom = sql_query_chromosomes_dict.get(
                                chrom, {}
                            ).get("count", 0)

                            log.debug(
                                f"Annotation '{annotation_name}' - Chromosome '{chrom}' [{nb_of_variant_by_chrom} variants]..."
                            )

                            # Annotation with regions database: join on
                            # POS/START/END overlap and aggregate per position
                            if parquet_type in ["regions"]:
                                sql_query_annotation_from_clause = f"""
                                    FROM (
                                        SELECT
                                            '{chrom}' AS \"#CHROM\",
                                            table_variants_from.\"POS\" AS \"POS\",
                                            {",".join(sql_query_annotation_to_agregate)}
                                        FROM {table_variants} as table_variants_from
                                        LEFT JOIN {parquet_file_link} as table_parquet_from ON (
                                            table_parquet_from."#CHROM" = '{chrom}'
                                            AND table_variants_from.\"POS\" <= table_parquet_from.\"END\"
                                            AND (table_variants_from.\"POS\" >= (table_parquet_from.\"START\"+1)
                                                OR table_variants_from.\"POS\" + (len(table_variants_from.\"REF\")-1) >= (table_parquet_from.\"START\"+1)
                                            )
                                        )
                                        WHERE table_variants_from.\"#CHROM\" in ('{chrom}')
                                        GROUP BY table_variants_from.\"POS\"
                                    )
                                    as table_parquet
                                    """

                                sql_query_annotation_where_clause = """
                                    table_parquet.\"#CHROM\" = table_variants.\"#CHROM\"
                                    AND table_parquet.\"POS\" = table_variants.\"POS\"
                                    """

                            # Annotation with variants database: exact match
                            # on CHROM/POS/REF/ALT
                            else:
                                sql_query_annotation_from_clause = f"""
                                    FROM {parquet_file_link} as table_parquet
                                    """
                                sql_query_annotation_where_clause = f"""
                                    table_variants."#CHROM" = '{chrom}'
                                    AND table_parquet.\"#CHROM\" = table_variants.\"#CHROM\"
                                    AND table_parquet.\"POS\" = table_variants.\"POS\"
                                    AND table_parquet.\"ALT\" = table_variants.\"ALT\"
                                    AND table_parquet.\"REF\" = table_variants.\"REF\"
                                    """

                            # Create update query: append the new annotations
                            # to INFO, inserting a ';' separator only when both
                            # sides are non-empty
                            sql_query_annotation_chrom_interval_pos = f"""
                                UPDATE {table_variants} as table_variants
                                SET INFO =
                                    concat(
                                        CASE WHEN table_variants.INFO NOT IN ('','.')
                                            THEN table_variants.INFO
                                            ELSE ''
                                        END
                                        ,
                                        CASE WHEN table_variants.INFO NOT IN ('','.')
                                                AND (
                                                    concat({sql_query_annotation_update_info_sets_sql})
                                                )
                                                NOT IN ('','.')
                                            THEN ';'
                                            ELSE ''
                                        END
                                        ,
                                        {sql_query_annotation_update_info_sets_sql}
                                    )
                                {sql_query_annotation_from_clause}
                                WHERE {sql_query_annotation_where_clause}
                                ;
                                """

                            # Add update query to dict
                            query_dict[
                                f"{chrom} [{nb_of_variant_by_chrom} variants]"
                            ] = sql_query_annotation_chrom_interval_pos

                        nb_of_query = len(query_dict)
                        num_query = 0

                        # SET max_expression_depth TO x (the generated CASE
                        # expressions can be deeply nested)
                        self.conn.execute("SET max_expression_depth TO 10000")

                        for query_name in query_dict:
                            query = query_dict[query_name]
                            num_query += 1
                            log.info(
                                f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name}..."
                            )
                            result = self.conn.execute(query)
                            # Number of updated rows read from the result's
                            # "Count" column
                            nb_of_variant_annotated_by_query = result.df()["Count"][0]
                            nb_of_variant_annotated += nb_of_variant_annotated_by_query
                            log.info(
                                f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name} - {nb_of_variant_annotated_by_query} variants annotated"
                            )

                        log.info(
                            f"Annotation '{annotation_name}' - Annotation of {nb_of_variant_annotated} variants out of {nb_variants} (with {nb_of_query} queries)"
                        )

                    else:

                        log.info(
                            f"Annotation '{annotation_name}' - No Annotations available"
                        )

        log.debug("Final header: " + str(vcf_reader.infos))

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)
It takes a VCF file and annotates it with a Parquet file.
Parameters
- threads: number of threads to use for the annotation
Returns
the result of the annotation.
6046 def annotation_splice(self, threads: int = None) -> None: 6047 """ 6048 This function annotate with snpEff 6049 6050 :param threads: The number of threads to use 6051 :return: the value of the variable "return_value". 6052 """ 6053 6054 # DEBUG 6055 log.debug("Start annotation with splice tools") 6056 6057 # Threads 6058 if not threads: 6059 threads = self.get_threads() 6060 log.debug("Threads: " + str(threads)) 6061 6062 # DEBUG 6063 delete_tmp = True 6064 if self.get_config().get("verbosity", "warning") in ["debug"]: 6065 delete_tmp = False 6066 log.debug("Delete tmp files/folders: " + str(delete_tmp)) 6067 6068 # Config 6069 config = self.get_config() 6070 log.debug("Config: " + str(config)) 6071 splice_config = config.get("tools", {}).get("splice", {}) 6072 if not splice_config: 6073 splice_config = DEFAULT_TOOLS_BIN.get("splice", {}) 6074 if not splice_config: 6075 msg_err = "No Splice tool config" 6076 log.error(msg_err) 6077 raise ValueError(msg_err) 6078 log.debug(f"splice_config={splice_config}") 6079 6080 # Config - Folders - Databases 6081 databases_folders = ( 6082 config.get("folders", {}).get("databases", {}).get("splice", ["."]) 6083 ) 6084 log.debug("Databases annotations: " + str(databases_folders)) 6085 6086 # Splice docker image 6087 splice_docker_image = splice_config.get("docker").get("image") 6088 6089 # Pull splice image if it's not already there 6090 if not check_docker_image_exists(splice_docker_image): 6091 log.warning( 6092 f"Annotation: splice docker image {splice_docker_image} not found locally, trying to pull from dockerhub" 6093 ) 6094 try: 6095 command(f"docker pull {splice_config.get('docker').get('image')}") 6096 except subprocess.CalledProcessError: 6097 msg_err = f"Unable to find docker {splice_docker_image} on dockerhub" 6098 log.error(msg_err) 6099 raise ValueError(msg_err) 6100 return None 6101 6102 # Config - splice databases 6103 splice_databases = ( 6104 config.get("folders", {}) 6105 .get("databases", {}) 6106 
.get("splice", DEFAULT_SPLICE_FOLDER) 6107 ) 6108 splice_databases = full_path(splice_databases) 6109 6110 # Param 6111 param = self.get_param() 6112 log.debug("Param: " + str(param)) 6113 6114 # Param 6115 options = param.get("annotation", {}).get("splice", {}) 6116 log.debug("Options: " + str(options)) 6117 6118 # Data 6119 table_variants = self.get_table_variants() 6120 6121 # Check if not empty 6122 log.debug("Check if not empty") 6123 sql_query_chromosomes = ( 6124 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 6125 ) 6126 if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]: 6127 log.info("VCF empty") 6128 return None 6129 6130 # Export in VCF 6131 log.debug("Create initial file to annotate") 6132 6133 # Create output folder 6134 output_folder = os.path.join(self.get_tmp_dir(), f"splice-{get_random()}") 6135 if not os.path.exists(output_folder): 6136 Path(output_folder).mkdir(parents=True, exist_ok=True) 6137 6138 # Create tmp VCF file 6139 tmp_vcf = NamedTemporaryFile( 6140 prefix=self.get_prefix(), 6141 dir=output_folder, 6142 suffix=".vcf", 6143 delete=False, 6144 ) 6145 tmp_vcf_name = tmp_vcf.name 6146 6147 # VCF header 6148 header = self.get_header() 6149 6150 # Existing annotations 6151 for vcf_annotation in self.get_header().infos: 6152 6153 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 6154 log.debug( 6155 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 6156 ) 6157 6158 # Memory limit 6159 if config.get("memory", None): 6160 memory_limit = config.get("memory", "8G").upper() 6161 # upper() 6162 else: 6163 memory_limit = "8G" 6164 log.debug(f"memory_limit: {memory_limit}") 6165 6166 # Check number of variants to annotate 6167 where_clause_regex_spliceai = r"SpliceAI_\w+" 6168 where_clause_regex_spip = r"SPiP_\w+" 6169 where_clause = f""" WHERE NOT regexp_matches("INFO", '{where_clause_regex_spliceai}') AND NOT regexp_matches("INFO", '{where_clause_regex_spip}')""" 6170 
df_list_of_variants_to_annotate = self.get_query_to_df( 6171 query=f""" SELECT * FROM variants {where_clause} """ 6172 ) 6173 if len(df_list_of_variants_to_annotate) == 0: 6174 log.warning( 6175 f"No variants to annotate with splice. Variants probably already annotated with splice" 6176 ) 6177 return None 6178 else: 6179 log.info(f"Annotation: {len(df_list_of_variants_to_annotate)} variants") 6180 6181 # Export VCF file 6182 self.export_variant_vcf( 6183 vcf_file=tmp_vcf_name, 6184 remove_info=True, 6185 add_samples=True, 6186 index=False, 6187 where_clause=where_clause, 6188 ) 6189 6190 # Create docker container and launch splice analysis 6191 if splice_config: 6192 6193 # Splice mount folders 6194 mount_folders = splice_config.get("mount", {}) 6195 6196 # Genome mount 6197 mount_folders[ 6198 config.get("folders", {}) 6199 .get("databases", {}) 6200 .get("genomes", DEFAULT_GENOME_FOLDER) 6201 ] = "ro" 6202 6203 # SpliceAI mount 6204 mount_folders[ 6205 config.get("folders", {}) 6206 .get("databases", {}) 6207 .get("spliceai", DEFAULT_SPLICEAI_FOLDER) 6208 ] = "ro" 6209 6210 # Genome mount 6211 mount_folders[ 6212 config.get("folders", {}) 6213 .get("databases", {}) 6214 .get("spip", DEFAULT_SPIP_FOLDER) 6215 ] = "ro" 6216 6217 # Mount folders 6218 mount = [] 6219 6220 # Config mount 6221 mount = [ 6222 f"-v {full_path(path)}:{full_path(path)}:{mode}" 6223 for path, mode in mount_folders.items() 6224 ] 6225 6226 if any(value for value in splice_config.values() if value is None): 6227 log.warning("At least one splice config parameter is empty") 6228 return None 6229 6230 # Params in splice nf 6231 def check_values(dico: dict): 6232 """ 6233 Ensure parameters for NF splice pipeline 6234 """ 6235 for key, val in dico.items(): 6236 if key == "genome": 6237 if any( 6238 assemb in options.get("genome", {}) 6239 for assemb in ["hg19", "GRCh37", "grch37", "GRCH37"] 6240 ): 6241 yield f"--{key} hg19" 6242 elif any( 6243 assemb in options.get("genome", {}) 6244 for assemb 
in ["hg38", "GRCh38", "grch38", "GRCH38"] 6245 ): 6246 yield f"--{key} hg38" 6247 elif ( 6248 (isinstance(val, str) and val) 6249 or isinstance(val, int) 6250 or isinstance(val, bool) 6251 ): 6252 yield f"--{key} {val}" 6253 6254 # Genome 6255 genome = options.get("genome", config.get("assembly", DEFAULT_ASSEMBLY)) 6256 options["genome"] = genome 6257 6258 # NF params 6259 nf_params = [] 6260 6261 # Add options 6262 if options: 6263 nf_params = list(check_values(options)) 6264 log.debug(f"Splice NF params: {' '.join(nf_params)}") 6265 else: 6266 log.debug("No NF params provided") 6267 6268 # Add threads 6269 if "threads" not in options.keys(): 6270 nf_params.append(f"--threads {threads}") 6271 6272 # Genome path 6273 genome_path = find_genome( 6274 config.get("folders", {}) 6275 .get("databases", {}) 6276 .get("genomes", DEFAULT_GENOME_FOLDER), 6277 file=f"{genome}.fa", 6278 ) 6279 # Add genome path 6280 if not genome_path: 6281 raise ValueError( 6282 f"Can't find genome assembly {genome}.fa in {config.get('folders', {}).get('databases', {}).get('genomes', DEFAULT_GENOME_FOLDER)}" 6283 ) 6284 else: 6285 log.debug(f"Genome: {genome_path}") 6286 nf_params.append(f"--genome_path {genome_path}") 6287 6288 def splice_annotations(options: dict = {}, config: dict = {}) -> list: 6289 """ 6290 Setting up updated databases for SPiP and SpliceAI 6291 """ 6292 6293 try: 6294 6295 # SpliceAI assembly transcriptome 6296 spliceai_assembly = os.path.join( 6297 config.get("folders", {}) 6298 .get("databases", {}) 6299 .get("spliceai", {}), 6300 options.get("genome"), 6301 "transcriptome", 6302 ) 6303 spip_assembly = options.get("genome") 6304 6305 spip = find( 6306 f"transcriptome_{spip_assembly}.RData", 6307 config.get("folders", {}).get("databases", {}).get("spip", {}), 6308 ) 6309 spliceai = find("spliceai.refseq.txt", spliceai_assembly) 6310 log.debug(f"SPiP annotations: {spip}") 6311 log.debug(f"SpliceAI annotations: {spliceai}") 6312 if spip and spliceai: 6313 return [ 6314 
f"--spip_transcriptome {spip}", 6315 f"--spliceai_annotations {spliceai}", 6316 ] 6317 else: 6318 # TODO crash and go on with basic annotations ? 6319 # raise ValueError( 6320 # "Can't find splice databases in configuration EXIT" 6321 # ) 6322 log.warning( 6323 "Can't find splice databases in configuration, use annotations file from image" 6324 ) 6325 except TypeError: 6326 log.warning( 6327 "Can't find splice databases in configuration, use annotations file from image" 6328 ) 6329 return [] 6330 6331 # Add options, check if transcriptome option have already beend provided 6332 if ( 6333 "spip_transcriptome" not in nf_params 6334 and "spliceai_transcriptome" not in nf_params 6335 ): 6336 splice_reference = splice_annotations(options, config) 6337 if splice_reference: 6338 nf_params.extend(splice_reference) 6339 6340 nf_params.append(f"--output_folder {output_folder}") 6341 6342 random_uuid = f"HOWARD-SPLICE-{get_random()}" 6343 cmd = f"nextflow -log {os.path.join(output_folder, f'{random_uuid}.log')} -c /app/SpliceToolBox/src/splicetoolbox/nextflow/nextflow.docker.config run /app/SpliceToolBox/src/splicetoolbox/nextflow/main.nf -entry SPLICE --vcf {tmp_vcf_name} {' '.join(nf_params)} -profile standard,conda,singularity,report,timeline" 6344 log.debug(cmd) 6345 6346 splice_config["docker"]["command"] = cmd 6347 6348 docker_cmd = get_bin_command( 6349 tool="splice", 6350 bin_type="docker", 6351 config=config, 6352 default_folder=f"{DEFAULT_TOOLS_FOLDER}/docker", 6353 add_options=f"--name {random_uuid} {' '.join(mount)}", 6354 ) 6355 6356 # Docker debug 6357 # if splice_config.get("rm_container"): 6358 # rm_container = "--rm" 6359 # else: 6360 # rm_container = "" 6361 # docker_cmd = f"docker run {rm_container} --entrypoint '/bin/bash' --name {random_uuid} {' '.join(mount)} {':'.join(splice_config.get('image'))} {cmd}" 6362 6363 log.debug(docker_cmd) 6364 res = subprocess.run(docker_cmd, shell=True, capture_output=True, text=True) 6365 log.debug(res.stdout) 6366 if 
res.stderr: 6367 log.error(res.stderr) 6368 res.check_returncode() 6369 else: 6370 log.warning(f"Splice tool configuration not found: {config}") 6371 6372 # Update variants 6373 log.info("Annotation - Updating...") 6374 # Test find output vcf 6375 log.debug( 6376 f"TMP splice output: {os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz" 6377 ) 6378 output_vcf = [] 6379 # Wrong folder to look in 6380 for files in os.listdir(os.path.dirname(tmp_vcf_name)): 6381 if ( 6382 files 6383 == f"{os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz" 6384 ): 6385 output_vcf.append(os.path.join(os.path.dirname(tmp_vcf_name), files)) 6386 # log.debug(os.listdir(options.get("output_folder"))) 6387 log.debug(f"Splice annotated vcf: {output_vcf[0]}") 6388 if not output_vcf: 6389 log.debug( 6390 f"Splice output was not generated {os.path.basename(tmp_vcf_name)}*.spip.spliceai.sorted.vcf.gz" 6391 ) 6392 else: 6393 # Get new header from annotated vcf 6394 log.debug(f"Initial header: {len(header.infos)} fields") 6395 # Create new header with splice infos 6396 new_vcf = Variants(input=output_vcf[0]) 6397 new_vcf_header = new_vcf.get_header().infos 6398 for keys, infos in new_vcf_header.items(): 6399 if keys not in header.infos.keys(): 6400 header.infos[keys] = infos 6401 log.debug(f"New header: {len(header.infos)} fields") 6402 log.debug(f"Splice tmp output: {output_vcf[0]}") 6403 self.update_from_vcf(output_vcf[0]) 6404 6405 # Remove folder 6406 remove_if_exists(output_folder)
This function annotates with splice tools (SPiP and SpliceAI).
Parameters
- threads: the number of threads to use
Returns
None.
6412 def get_config_default(self, name: str) -> dict: 6413 """ 6414 The function `get_config_default` returns a dictionary containing default configurations for 6415 various calculations and prioritizations. 6416 6417 :param name: The `get_config_default` function returns a dictionary containing default 6418 configurations for different calculations and prioritizations. The `name` parameter is used to 6419 specify which specific configuration to retrieve from the dictionary 6420 :type name: str 6421 :return: The function `get_config_default` returns a dictionary containing default configuration 6422 settings for different calculations and prioritizations. The specific configuration settings are 6423 retrieved based on the input `name` parameter provided to the function. If the `name` parameter 6424 matches a key in the `config_default` dictionary, the corresponding configuration settings are 6425 returned. If there is no match, an empty dictionary is returned. 6426 """ 6427 6428 config_default = { 6429 "calculations": { 6430 "variant_chr_pos_alt_ref": { 6431 "type": "sql", 6432 "name": "variant_chr_pos_alt_ref", 6433 "description": "Create a variant ID with chromosome, position, alt and ref", 6434 "available": False, 6435 "output_column_name": "variant_chr_pos_alt_ref", 6436 "output_column_type": "String", 6437 "output_column_description": "variant ID with chromosome, position, alt and ref", 6438 "operation_query": """ concat("#CHROM", '_', "POS", '_', "REF", '_', "ALT") """, 6439 "operation_info": True, 6440 }, 6441 "VARTYPE": { 6442 "type": "sql", 6443 "name": "VARTYPE", 6444 "description": "Variant type (e.g. 
SNV, INDEL, MNV, BND...)", 6445 "available": True, 6446 "output_column_name": "VARTYPE", 6447 "output_column_type": "String", 6448 "output_column_description": "Variant type: SNV if X>Y, MOSAIC if X>Y,Z or X,Y>Z, INDEL if XY>Z or X>YZ", 6449 "operation_query": """ 6450 CASE 6451 WHEN "SVTYPE" NOT NULL THEN "SVTYPE" 6452 WHEN LENGTH(REF) = 1 AND LENGTH(ALT) = 1 THEN 'SNV' 6453 WHEN REF LIKE '%,%' OR ALT LIKE '%,%' THEN 'MOSAIC' 6454 WHEN LENGTH(REF) == LENGTH(ALT) AND LENGTH(REF) > 1 THEN 'MNV' 6455 WHEN LENGTH(REF) <> LENGTH(ALT) THEN 'INDEL' 6456 ELSE 'UNDEFINED' 6457 END 6458 """, 6459 "info_fields": ["SVTYPE"], 6460 "operation_info": True, 6461 }, 6462 "snpeff_hgvs": { 6463 "type": "python", 6464 "name": "snpeff_hgvs", 6465 "description": "HGVS nomenclatures from snpEff annotation", 6466 "available": True, 6467 "function_name": "calculation_extract_snpeff_hgvs", 6468 "function_params": ["snpeff_hgvs", "ANN"], 6469 }, 6470 "snpeff_ann_explode": { 6471 "type": "python", 6472 "name": "snpeff_ann_explode", 6473 "description": "Explode snpEff annotations with uniquify values", 6474 "available": True, 6475 "function_name": "calculation_snpeff_ann_explode", 6476 "function_params": [False, "fields", "snpeff_", "ANN"], 6477 }, 6478 "snpeff_ann_explode_uniquify": { 6479 "type": "python", 6480 "name": "snpeff_ann_explode_uniquify", 6481 "description": "Explode snpEff annotations", 6482 "available": True, 6483 "function_name": "calculation_snpeff_ann_explode", 6484 "function_params": [True, "fields", "snpeff_uniquify_", "ANN"], 6485 }, 6486 "snpeff_ann_explode_json": { 6487 "type": "python", 6488 "name": "snpeff_ann_explode_json", 6489 "description": "Explode snpEff annotations in JSON format", 6490 "available": True, 6491 "function_name": "calculation_snpeff_ann_explode", 6492 "function_params": [False, "JSON", "snpeff_json", "ANN"], 6493 }, 6494 "NOMEN": { 6495 "type": "python", 6496 "name": "NOMEN", 6497 "description": "NOMEN information (e.g. NOMEN, CNOMEN, PNOMEN...) 
from HGVS nomenclature field", 6498 "available": True, 6499 "function_name": "calculation_extract_nomen", 6500 "function_params": [], 6501 }, 6502 "FINDBYPIPELINE": { 6503 "type": "python", 6504 "name": "FINDBYPIPELINE", 6505 "description": "Number of pipeline that identify the variant (for multi pipeline VCF)", 6506 "available": True, 6507 "function_name": "calculation_find_by_pipeline", 6508 "function_params": ["findbypipeline"], 6509 }, 6510 "FINDBYSAMPLE": { 6511 "type": "python", 6512 "name": "FINDBYSAMPLE", 6513 "description": "Number of sample that have a genotype for the variant (for multi sample VCF)", 6514 "available": True, 6515 "function_name": "calculation_find_by_pipeline", 6516 "function_params": ["findbysample"], 6517 }, 6518 "GENOTYPECONCORDANCE": { 6519 "type": "python", 6520 "name": "GENOTYPECONCORDANCE", 6521 "description": "Concordance of genotype for multi caller VCF", 6522 "available": True, 6523 "function_name": "calculation_genotype_concordance", 6524 "function_params": [], 6525 }, 6526 "BARCODE": { 6527 "type": "python", 6528 "name": "BARCODE", 6529 "description": "BARCODE as VaRank tool", 6530 "available": True, 6531 "function_name": "calculation_barcode", 6532 "function_params": [], 6533 }, 6534 "BARCODEFAMILY": { 6535 "type": "python", 6536 "name": "BARCODEFAMILY", 6537 "description": "BARCODEFAMILY as VaRank tool", 6538 "available": True, 6539 "function_name": "calculation_barcode_family", 6540 "function_params": ["BCF"], 6541 }, 6542 "TRIO": { 6543 "type": "python", 6544 "name": "TRIO", 6545 "description": "Inheritance for a trio family", 6546 "available": True, 6547 "function_name": "calculation_trio", 6548 "function_params": [], 6549 }, 6550 "VAF": { 6551 "type": "python", 6552 "name": "VAF", 6553 "description": "Variant Allele Frequency (VAF) harmonization", 6554 "available": True, 6555 "function_name": "calculation_vaf_normalization", 6556 "function_params": [], 6557 }, 6558 "VAF_stats": { 6559 "type": "python", 6560 "name": 
"VAF_stats", 6561 "description": "Variant Allele Frequency (VAF) statistics", 6562 "available": True, 6563 "function_name": "calculation_genotype_stats", 6564 "function_params": ["VAF"], 6565 }, 6566 "DP_stats": { 6567 "type": "python", 6568 "name": "DP_stats", 6569 "description": "Depth (DP) statistics", 6570 "available": True, 6571 "function_name": "calculation_genotype_stats", 6572 "function_params": ["DP"], 6573 }, 6574 "variant_id": { 6575 "type": "python", 6576 "name": "variant_id", 6577 "description": "Variant ID generated from variant position and type", 6578 "available": True, 6579 "function_name": "calculation_variant_id", 6580 "function_params": [], 6581 }, 6582 "transcripts_json": { 6583 "type": "python", 6584 "name": "transcripts_json", 6585 "description": "Add transcripts annotations in JSON format (field 'transcripts_json')", 6586 "available": True, 6587 "function_name": "calculation_transcripts_annotation", 6588 "function_params": ["transcripts_json", None], 6589 }, 6590 "transcripts_ann": { 6591 "type": "python", 6592 "name": "transcripts_ann", 6593 "description": "Add transcripts annotations in structured format (field 'transcripts_ann')", 6594 "available": True, 6595 "function_name": "calculation_transcripts_annotation", 6596 "function_params": [None, "transcripts_ann"], 6597 }, 6598 "transcripts_annotations": { 6599 "type": "python", 6600 "name": "transcripts_annotations", 6601 "description": "Add transcripts annotations in JSON and/or structured format (see param JSON file)", 6602 "available": True, 6603 "function_name": "calculation_transcripts_annotation", 6604 "function_params": [None, None], 6605 }, 6606 "transcripts_prioritization": { 6607 "type": "python", 6608 "name": "transcripts_prioritization", 6609 "description": "Prioritize transcripts with a prioritization profile (using param.json)", 6610 "available": True, 6611 "function_name": "calculation_transcripts_prioritization", 6612 "function_params": [], 6613 }, 6614 }, 6615 
"prioritizations": { 6616 "default": { 6617 "filter": [ 6618 { 6619 "type": "notequals", 6620 "value": "!PASS|\\.", 6621 "score": 0, 6622 "flag": "FILTERED", 6623 "comment": ["Bad variant quality"], 6624 }, 6625 { 6626 "type": "equals", 6627 "value": "REJECT", 6628 "score": -20, 6629 "flag": "PASS", 6630 "comment": ["Bad variant quality"], 6631 }, 6632 ], 6633 "DP": [ 6634 { 6635 "type": "gte", 6636 "value": "50", 6637 "score": 5, 6638 "flag": "PASS", 6639 "comment": ["DP higher than 50"], 6640 } 6641 ], 6642 "ANN": [ 6643 { 6644 "type": "contains", 6645 "value": "HIGH", 6646 "score": 5, 6647 "flag": "PASS", 6648 "comment": [ 6649 "The variant is assumed to have high (disruptive) impact in the protein, probably causing protein truncation, loss of function or triggering nonsense mediated decay" 6650 ], 6651 }, 6652 { 6653 "type": "contains", 6654 "value": "MODERATE", 6655 "score": 3, 6656 "flag": "PASS", 6657 "comment": [ 6658 "A non-disruptive variant that might change protein effectiveness" 6659 ], 6660 }, 6661 { 6662 "type": "contains", 6663 "value": "LOW", 6664 "score": 0, 6665 "flag": "FILTERED", 6666 "comment": [ 6667 "Assumed to be mostly harmless or unlikely to change protein behavior" 6668 ], 6669 }, 6670 { 6671 "type": "contains", 6672 "value": "MODIFIER", 6673 "score": 0, 6674 "flag": "FILTERED", 6675 "comment": [ 6676 "Usually non-coding variants or variants affecting non-coding genes, where predictions are difficult or there is no evidence of impact" 6677 ], 6678 }, 6679 ], 6680 } 6681 }, 6682 } 6683 6684 return config_default.get(name, None)
The function `get_config_default` returns a dictionary containing default configurations for
various calculations and prioritizations.
Parameters
- name: the `name` parameter specifies which configuration to retrieve from the dictionary of defaults.
Returns
The function `get_config_default` returns a dictionary containing default configuration settings for different calculations and prioritizations, selected by the `name` parameter. If the `name` parameter matches a key in the `config_default` dictionary, the corresponding configuration settings are returned. If there is no match, an empty configuration is returned.
6686 def get_config_json( 6687 self, name: str, config_dict: dict = {}, config_file: str = None 6688 ) -> dict: 6689 """ 6690 The function `get_config_json` retrieves a configuration JSON object with prioritizations from 6691 default values, a dictionary, and a file. 6692 6693 :param name: The `name` parameter in the `get_config_json` function is a string that represents 6694 the name of the configuration. It is used to identify and retrieve the configuration settings 6695 for a specific component or module 6696 :type name: str 6697 :param config_dict: The `config_dict` parameter in the `get_config_json` function is a 6698 dictionary that allows you to provide additional configuration settings or overrides. When you 6699 call the `get_config_json` function, you can pass a dictionary containing key-value pairs where 6700 the key is the configuration setting you want to override or 6701 :type config_dict: dict 6702 :param config_file: The `config_file` parameter in the `get_config_json` function is used to 6703 specify the path to a configuration file that contains additional settings. If provided, the 6704 function will read the contents of this file and update the configuration dictionary with the 6705 values found in the file, overriding any existing values with the 6706 :type config_file: str 6707 :return: The function `get_config_json` returns a dictionary containing the configuration 6708 settings. 
6709 """ 6710 6711 # Create with default prioritizations 6712 config_default = self.get_config_default(name=name) 6713 configuration = config_default 6714 # log.debug(f"configuration={configuration}") 6715 6716 # Replace prioritizations from dict 6717 for config in config_dict: 6718 configuration[config] = config_dict[config] 6719 6720 # Replace prioritizations from file 6721 config_file = full_path(config_file) 6722 if config_file: 6723 if os.path.exists(config_file): 6724 with open(config_file) as config_file_content: 6725 config_file_dict = json.load(config_file_content) 6726 for config in config_file_dict: 6727 configuration[config] = config_file_dict[config] 6728 else: 6729 msg_error = f"Config '{name}' file '{config_file}' does NOT exist" 6730 log.error(msg_error) 6731 raise ValueError(msg_error) 6732 6733 return configuration
The function `get_config_json` retrieves a configuration JSON object with prioritizations from
default values, a dictionary, and a file.
Parameters
- name: the `name` parameter in the `get_config_json` function is a string that represents the name of the configuration. It is used to identify and retrieve the configuration settings for a specific component or module.
- config_dict: the `config_dict` parameter in the `get_config_json` function is a dictionary of additional configuration settings or overrides; its key-value pairs replace the corresponding default settings.
- config_file: the `config_file` parameter in the `get_config_json` function is the path to a configuration file that contains additional settings. If provided, the function reads the contents of this file and updates the configuration dictionary with the values found in the file, overriding any existing values with the same keys.
Returns
The function `get_config_json` returns a dictionary containing the configuration settings.
    def prioritization(
        self, table: str = None, pz_prefix: str = None, pz_param: dict = None
    ) -> bool:
        """
        The `prioritization` function processes VCF variants, adds new INFO fields, and
        prioritizes variants based on configured profiles and criteria.

        For each selected profile, every annotation criterion in the profile
        configuration is translated into an SQL UPDATE against the variants table;
        the per-profile score/flag/comment columns are then folded back into the
        INFO column.

        :param table: name of the table on which the prioritization operation will be
            performed; defaults to the variants table from `get_table_variants`
        :type table: str
        :param pz_prefix: prefix added to the generated INFO fields (PZScore, PZFlag,
            PZComment, PZInfos, PZTags); defaults to "PZ" (or `pzprefix` in param)
        :type pz_prefix: str
        :param pz_param: prioritization parameters (profiles, pzfields, score mode,
            separators, ...); defaults to param["prioritization"]
        :type pz_param: dict
        :return: True when prioritization ran; False when no profile is defined
        :raises ValueError: when a requested profile is not present in the
            prioritizations configuration
        """

        # Config
        config = self.get_config()

        # Param
        param = self.get_param()

        # Prioritization param
        if pz_param is not None:
            prioritization_param = pz_param
        else:
            prioritization_param = param.get("prioritization", {})

        # Configuration profiles
        prioritization_config_file = prioritization_param.get(
            "prioritization_config", None
        )
        prioritization_config_file = full_path(prioritization_config_file)
        prioritizations_config = self.get_config_json(
            name="prioritizations", config_file=prioritization_config_file
        )

        # Prioritization prefix
        pz_prefix_default = "PZ"
        if pz_prefix is None:
            pz_prefix = prioritization_param.get("pzprefix", pz_prefix_default)

        # Prioritization options
        profiles = prioritization_param.get("profiles", [])
        if isinstance(profiles, str):
            profiles = profiles.split(",")
        pzfields = prioritization_param.get(
            "pzfields", [f"{pz_prefix}Flag", f"{pz_prefix}Score"]
        )
        if isinstance(pzfields, str):
            pzfields = pzfields.split(",")
        default_profile = prioritization_param.get("default_profile", None)
        pzfields_sep = prioritization_param.get("pzfields_sep", "_")
        prioritization_score_mode = prioritization_param.get(
            "prioritization_score_mode", "HOWARD"
        )

        # Quick Prioritizations (comma-separated shortcut in param["prioritizations"])
        prioritizations = param.get("prioritizations", None)
        if prioritizations:
            log.info("Quick Prioritization:")
            for profile in prioritizations.split(","):
                if profile not in profiles:
                    profiles.append(profile)
                log.info(f" {profile}")

        # If profile "ALL" provided, all profiles in the config profiles
        if "ALL" in profiles:
            profiles = list(prioritizations_config.keys())

        # Fail fast on any profile missing from the configuration
        for profile in profiles:
            if prioritizations_config.get(profile, None):
                log.debug(f"Profile '{profile}' configured")
            else:
                msg_error = f"Profile '{profile}' NOT configured"
                log.error(msg_error)
                raise ValueError(msg_error)

        if profiles:
            log.info(f"Prioritization... ")
        else:
            log.debug(f"No profile defined")
            return False

        # First requested profile becomes the default (unsuffixed) one
        if not default_profile and len(profiles):
            default_profile = profiles[0]

        log.debug("Profiles availables: " + str(list(prioritizations_config.keys())))
        log.debug("Profiles to check: " + str(list(profiles)))

        # Variables
        if table is not None:
            table_variants = table
        else:
            table_variants = self.get_table_variants(clause="update")
        log.debug(f"Table to prioritize: {table_variants}")

        # Added columns (temporary working columns, dropped at the end)
        added_columns = []

        # Create list of PZfields
        # List of PZFields: unsuffixed fields plus one per (field, profile) pair
        list_of_pzfields_original = pzfields + [
            pzfield + pzfields_sep + profile
            for pzfield in pzfields
            for profile in profiles
        ]
        list_of_pzfields = []
        log.debug(f"{list_of_pzfields_original}")

        # Remove existing PZfields to use if exists (only fields absent from the
        # VCF header are (re)computed)
        for pzfield in list_of_pzfields_original:
            if self.get_header().infos.get(pzfield, None) is None:
                list_of_pzfields.append(pzfield)
                log.debug(f"VCF Input - Header - PZfield '{pzfield}' not in VCF")
            else:
                log.debug(f"VCF Input - Header - PZfield '{pzfield}' already in VCF")

        if list_of_pzfields:

            # Explode Infos prefix
            explode_infos_prefix = self.get_explode_infos_prefix()

            # PZfields tags description (VCF header metadata for each generated field)
            PZfields_INFOS = {
                f"{pz_prefix}Tags": {
                    "ID": f"{pz_prefix}Tags",
                    "Number": ".",
                    "Type": "String",
                    "Description": "Variant tags based on annotation criteria",
                },
                f"{pz_prefix}Score": {
                    "ID": f"{pz_prefix}Score",
                    "Number": 1,
                    "Type": "Integer",
                    "Description": "Variant score based on annotation criteria",
                },
                f"{pz_prefix}Flag": {
                    "ID": f"{pz_prefix}Flag",
                    "Number": 1,
                    "Type": "String",
                    "Description": "Variant flag based on annotation criteria",
                },
                f"{pz_prefix}Comment": {
                    "ID": f"{pz_prefix}Comment",
                    "Number": ".",
                    "Type": "String",
                    "Description": "Variant comment based on annotation criteria",
                },
                f"{pz_prefix}Infos": {
                    "ID": f"{pz_prefix}Infos",
                    "Number": ".",
                    "Type": "String",
                    "Description": "Variant infos based on annotation criteria",
                },
            }

            # Create INFO fields if not exist (unsuffixed fields, tied to the default profile)
            for field in PZfields_INFOS:
                field_ID = PZfields_INFOS[field]["ID"]
                field_description = PZfields_INFOS[field]["Description"]
                if field_ID not in self.get_header().infos and field_ID in pzfields:
                    field_description = (
                        PZfields_INFOS[field]["Description"]
                        + f", profile {default_profile}"
                    )
                    self.get_header().infos[field_ID] = vcf.parser._Info(
                        field_ID,
                        PZfields_INFOS[field]["Number"],
                        PZfields_INFOS[field]["Type"],
                        field_description,
                        "unknown",
                        "unknown",
                        code_type_map[PZfields_INFOS[field]["Type"]],
                    )

            # Create INFO fields if not exist for each profile (suffixed variants)
            for profile in prioritizations_config:
                if profile in profiles or profiles == []:
                    for field in PZfields_INFOS:
                        field_ID = PZfields_INFOS[field]["ID"] + pzfields_sep + profile
                        field_description = (
                            PZfields_INFOS[field]["Description"]
                            + f", profile {profile}"
                        )
                        if (
                            field_ID not in self.get_header().infos
                            and field in pzfields
                        ):
                            self.get_header().infos[field_ID] = vcf.parser._Info(
                                field_ID,
                                PZfields_INFOS[field]["Number"],
                                PZfields_INFOS[field]["Type"],
                                field_description,
                                "unknown",
                                "unknown",
                                code_type_map[PZfields_INFOS[field]["Type"]],
                            )

            # Header: add one working column per PZfield with a type-appropriate
            # default (score starts at 0, flag starts as PASS=1, text starts empty)
            for pzfield in list_of_pzfields:
                if re.match(f"{pz_prefix}Score.*", pzfield):
                    added_column = self.add_column(
                        table_name=table_variants,
                        column_name=pzfield,
                        column_type="INTEGER",
                        default_value="0",
                    )
                elif re.match(f"{pz_prefix}Flag.*", pzfield):
                    added_column = self.add_column(
                        table_name=table_variants,
                        column_name=pzfield,
                        column_type="BOOLEAN",
                        default_value="1",
                    )
                else:
                    added_column = self.add_column(
                        table_name=table_variants,
                        column_name=pzfield,
                        column_type="STRING",
                        default_value="''",
                    )
                added_columns.append(added_column)

            # Profiles
            if profiles:

                # foreach profile in configuration file
                for profile in prioritizations_config:

                    # If profile is asked in param, or ALL are asked (empty profile [])
                    if profile in profiles or profiles == []:
                        log.info(f"Profile '{profile}'")

                        sql_set_info_option = ""

                        sql_set_info = []

                        # PZ fields set
                        # Each fragment below renders one "KEY=value" piece for the
                        # final INFO concatenation

                        # PZScore
                        if (
                            f"{pz_prefix}Score{pzfields_sep}{profile}"
                            in list_of_pzfields
                        ):
                            sql_set_info.append(
                                f"""
                                concat(
                                    '{pz_prefix}Score{pzfields_sep}{profile}=',
                                    {pz_prefix}Score{pzfields_sep}{profile}
                                )
                                """
                            )
                        if (
                            profile == default_profile
                            and f"{pz_prefix}Score" in list_of_pzfields
                        ):
                            sql_set_info.append(
                                f"""
                                concat(
                                    '{pz_prefix}Score=',
                                    {pz_prefix}Score{pzfields_sep}{profile}
                                )
                                """
                            )

                        # PZFlag
                        if (
                            f"{pz_prefix}Flag{pzfields_sep}{profile}"
                            in list_of_pzfields
                        ):
                            sql_set_info.append(
                                f"""
                                concat(
                                    '{pz_prefix}Flag{pzfields_sep}{profile}=',
                                    CASE
                                        WHEN {pz_prefix}Flag{pzfields_sep}{profile}==1
                                        THEN 'PASS'
                                        WHEN {pz_prefix}Flag{pzfields_sep}{profile}==0
                                        THEN 'FILTERED'
                                    END
                                )
                                """
                            )
                        if (
                            profile == default_profile
                            and f"{pz_prefix}Flag" in list_of_pzfields
                        ):
                            sql_set_info.append(
                                f"""
                                concat(
                                    '{pz_prefix}Flag=',
                                    CASE
                                        WHEN {pz_prefix}Flag{pzfields_sep}{profile}==1
                                        THEN 'PASS'
                                        WHEN {pz_prefix}Flag{pzfields_sep}{profile}==0
                                        THEN 'FILTERED'
                                    END
                                )
                                """
                            )

                        # PZComment (only emitted when non-empty)
                        if (
                            f"{pz_prefix}Comment{pzfields_sep}{profile}"
                            in list_of_pzfields
                        ):
                            sql_set_info.append(
                                f"""
                                CASE
                                    WHEN {pz_prefix}Comment{pzfields_sep}{profile} NOT IN ('')
                                    THEN concat('{pz_prefix}Comment{pzfields_sep}{profile}=', {pz_prefix}Comment{pzfields_sep}{profile})
                                    ELSE ''
                                END
                                """
                            )
                        if (
                            profile == default_profile
                            and f"{pz_prefix}Comment" in list_of_pzfields
                        ):
                            sql_set_info.append(
                                f"""
                                CASE
                                    WHEN {pz_prefix}Comment{pzfields_sep}{profile} NOT IN ('')
                                    THEN concat('{pz_prefix}Comment=', {pz_prefix}Comment{pzfields_sep}{profile})
                                    ELSE ''
                                END
                                """
                            )

                        # PZInfos (only emitted when non-empty)
                        if (
                            f"{pz_prefix}Infos{pzfields_sep}{profile}"
                            in list_of_pzfields
                        ):
                            sql_set_info.append(
                                f"""
                                CASE
                                    WHEN {pz_prefix}Infos{pzfields_sep}{profile} NOT IN ('')
                                    THEN concat('{pz_prefix}Infos{pzfields_sep}{profile}=', {pz_prefix}Infos{pzfields_sep}{profile})
                                    ELSE ''
                                END
                                """
                            )
                        if (
                            profile == default_profile
                            and f"{pz_prefix}Infos" in list_of_pzfields
                        ):
                            sql_set_info.append(
                                f"""
                                CASE
                                    WHEN {pz_prefix}Infos{pzfields_sep}{profile} NOT IN ('')
                                    THEN concat('{pz_prefix}Infos=', {pz_prefix}Infos{pzfields_sep}{profile})
                                    ELSE ''
                                END
                                """
                            )

                        # Merge PZfields: join fragments with ';' separators into one
                        # SQL concat argument list
                        sql_set_info_option = ""
                        sql_set_sep = ""
                        for sql_set in sql_set_info:
                            if sql_set_sep:
                                sql_set_info_option += f"""
                                    , concat('{sql_set_sep}', {sql_set})
                                """
                            else:
                                sql_set_info_option += f"""
                                    , {sql_set}
                                """
                            sql_set_sep = ";"

                        sql_queries = []
                        for annotation in prioritizations_config[profile]:

                            # Explode specific annotation into its own column
                            log.debug(f"Explode annotation '{annotation}'")
                            added_columns += self.explode_infos(
                                prefix=explode_infos_prefix,
                                fields=[annotation],
                                table=table_variants,
                            )
                            extra_infos = self.get_extra_infos(table=table_variants)

                            # Check if annotation field is present
                            if not f"{explode_infos_prefix}{annotation}" in extra_infos:
                                log.debug(f"Annotation '{annotation}' not in data")
                                continue
                            else:
                                log.debug(f"Annotation '{annotation}' in data")

                            # For each criterions
                            for criterion in prioritizations_config[profile][
                                annotation
                            ]:
                                criterion_type = criterion["type"]
                                criterion_value = criterion["value"]
                                criterion_score = criterion.get("score", 0)
                                criterion_flag = criterion.get("flag", "PASS")
                                criterion_flag_bool = criterion_flag == "PASS"
                                # Escape quotes/separators so the text can be embedded
                                # in SQL literals and VCF INFO safely
                                criterion_comment = (
                                    ", ".join(criterion.get("comment", []))
                                    .replace("'", "''")
                                    .replace(";", ",")
                                    .replace("\t", " ")
                                )
                                criterion_infos = (
                                    str(criterion)
                                    .replace("'", "''")
                                    .replace(";", ",")
                                    .replace("\t", " ")
                                )

                                sql_set = []
                                # NOTE(review): reset here but not used within this
                                # criterion loop
                                sql_set_info = []

                                # PZ fields set
                                if (
                                    f"{pz_prefix}Score{pzfields_sep}{profile}"
                                    in list_of_pzfields
                                ):
                                    # HOWARD mode accumulates scores; VaRank mode keeps
                                    # the maximum score seen
                                    if prioritization_score_mode == "HOWARD":
                                        sql_set.append(
                                            f"{pz_prefix}Score{pzfields_sep}{profile} = {pz_prefix}Score{pzfields_sep}{profile} + {criterion_score}"
                                        )
                                    elif prioritization_score_mode == "VaRank":
                                        sql_set.append(
                                            f"{pz_prefix}Score{pzfields_sep}{profile} = CASE WHEN {criterion_score}>{pz_prefix}Score{pzfields_sep}{profile} THEN {criterion_score} END"
                                        )
                                    else:
                                        sql_set.append(
                                            f"{pz_prefix}Score{pzfields_sep}{profile} = {pz_prefix}Score{pzfields_sep}{profile} + {criterion_score}"
                                        )
                                if (
                                    f"{pz_prefix}Flag{pzfields_sep}{profile}"
                                    in list_of_pzfields
                                ):
                                    # Any FILTERED criterion turns the profile flag off
                                    sql_set.append(
                                        f"{pz_prefix}Flag{pzfields_sep}{profile} = {pz_prefix}Flag{pzfields_sep}{profile} AND {criterion_flag_bool}"
                                    )
                                if (
                                    f"{pz_prefix}Comment{pzfields_sep}{profile}"
                                    in list_of_pzfields
                                ):
                                    sql_set.append(
                                        f"""
                                        {pz_prefix}Comment{pzfields_sep}{profile} =
                                            concat(
                                                {pz_prefix}Comment{pzfields_sep}{profile},
                                                CASE
                                                    WHEN {pz_prefix}Comment{pzfields_sep}{profile}!=''
                                                    THEN ', '
                                                    ELSE ''
                                                END,
                                                '{criterion_comment}'
                                            )
                                        """
                                    )
                                if (
                                    f"{pz_prefix}Infos{pzfields_sep}{profile}"
                                    in list_of_pzfields
                                ):
                                    sql_set.append(
                                        f"""
                                        {pz_prefix}Infos{pzfields_sep}{profile} =
                                            concat(
                                                {pz_prefix}Infos{pzfields_sep}{profile},
                                                '{criterion_infos}'
                                            )
                                        """
                                    )
                                sql_set_option = ",".join(sql_set)

                                # Criterion and comparison: numeric values use the
                                # comparison operator from comparison_map; non-numeric
                                # values fall back to a SIMILAR TO (regex) match
                                if sql_set_option:
                                    try:
                                        float(criterion_value)
                                        sql_update = f"""
                                            UPDATE {table_variants}
                                            SET {sql_set_option}
                                            WHERE CAST("{explode_infos_prefix}{annotation}" AS VARCHAR) NOT IN ('','.')
                                            AND CAST("{explode_infos_prefix}{annotation}" AS FLOAT){comparison_map[criterion_type]}{criterion_value}
                                        """
                                    except:
                                        contains_option = ""
                                        if criterion_type == "contains":
                                            contains_option = ".*"
                                        sql_update = f"""
                                            UPDATE {table_variants}
                                            SET {sql_set_option}
                                            WHERE "{explode_infos_prefix}{annotation}" SIMILAR TO '{contains_option}{criterion_value}{contains_option}'
                                        """
                                    sql_queries.append(sql_update)
                                else:
                                    log.warning(
                                        f"NO SQL SET option for '{annotation}' - '{criterion}'"
                                    )

                        # PZTags: summary "field#value|field#value" string per profile
                        if (
                            f"{pz_prefix}Tags{pzfields_sep}{profile}"
                            in list_of_pzfields
                        ):

                            # Create PZFalgs value
                            pztags_value = ""
                            pztags_sep_default = "|"
                            pztags_sep = ""
                            for pzfield in pzfields:
                                if pzfield not in [f"{pz_prefix}Tags"]:
                                    if (
                                        f"{pzfield}{pzfields_sep}{profile}"
                                        in list_of_pzfields
                                    ):
                                        if pzfield in [f"{pz_prefix}Flag"]:
                                            pztags_value += f"""{pztags_sep}{pzfield}#',
                                                CASE WHEN {pz_prefix}Flag{pzfields_sep}{profile}
                                                THEN 'PASS'
                                                ELSE 'FILTERED'
                                                END, '"""
                                        else:
                                            pztags_value += f"{pztags_sep}{pzfield}#', {pzfield}{pzfields_sep}{profile}, '"
                                        pztags_sep = pztags_sep_default

                            # Add Query update for PZFlags
                            sql_update_pztags = f"""
                                UPDATE {table_variants}
                                SET INFO = concat(
                                    INFO,
                                    CASE WHEN INFO NOT in ('','.')
                                        THEN ';'
                                        ELSE ''
                                    END,
                                    '{pz_prefix}Tags{pzfields_sep}{profile}={pztags_value}'
                                )
                            """
                            sql_queries.append(sql_update_pztags)

                            # Add Query update for PZFlags for default
                            if profile == default_profile:
                                sql_update_pztags_default = f"""
                                    UPDATE {table_variants}
                                    SET INFO = concat(
                                        INFO,
                                        ';',
                                        '{pz_prefix}Tags={pztags_value}'
                                    )
                                """
                                sql_queries.append(sql_update_pztags_default)

                        log.info(f"""Profile '{profile}' - Prioritization... """)

                        if sql_queries:

                            for sql_query in sql_queries:
                                log.debug(
                                    f"""Profile '{profile}' - Prioritization query: {sql_query}... """
                                )
                                self.conn.execute(sql_query)

                            # Fold the computed per-profile columns back into INFO
                            log.info(f"""Profile '{profile}' - Update... """)
                            sql_query_update = f"""
                                UPDATE {table_variants}
                                SET INFO =
                                    concat(
                                        CASE
                                            WHEN INFO NOT IN ('','.')
                                            THEN concat(INFO, ';')
                                            ELSE ''
                                        END
                                        {sql_set_info_option}
                                    )
                            """
                            self.conn.execute(sql_query_update)

            else:

                log.warning(f"No profiles in parameters")

        # Remove added columns (temporary working columns)
        for added_column in added_columns:
            self.drop_column(column=added_column)

        # Explode INFOS fields into table fields
        if self.get_explode_infos():
            self.explode_infos(
                prefix=self.get_explode_infos_prefix(),
                fields=self.get_explode_infos_fields(),
                force=True,
            )

        return True
The prioritization function in Python processes VCF files, adds new INFO fields, and
prioritizes variants based on configured profiles and criteria.
Parameters
- table: The `table` parameter is used to specify the name of the table (presumably a VCF file) on which the prioritization operation will be performed. If a table name is provided, the method will prioritize the variants in that specific table.
- pz_prefix: The `pz_prefix` parameter is used to specify a prefix that will be added to certain INFO fields in a VCF file during the prioritization process. If this parameter is not provided, the code will use a default prefix value of "PZ".
- pz_param: The `pz_param` parameter is used to pass additional parameters specific to the prioritization process. These parameters can include settings related to prioritization profiles, fields, scoring modes, flags, comments, and other configurations needed for the prioritization of variants in a VCF.
Returns
A boolean value (True) is being returned from the `prioritization` function.
    def annotation_hgvs(self, threads: int = None) -> None:
        """
        The `annotation_hgvs` function performs HGVS annotation on a set of variants using genomic
        coordinates and alleles.

        Variants are joined against a refSeq transcript table (in DuckDB), HGVS names
        are computed per transcript (in parallel via Dask partitions), and the result
        is written back into the variants table's INFO column as an 'hgvs=' field.

        NOTE(review): relies on module-level names `pl` (polars) and `dd`
        (dask.dataframe) being imported elsewhere in this file — confirm.

        :param threads: optional number of threads (Dask partitions) to use for
            parallel processing; defaults to `get_threads()`
        :type threads: int
        """

        # Function for each partition of the Dask Dataframe
        def partition_function(partition):
            """
            Apply `annotation_hgvs_partition` to each row of the `partition`
            DataFrame.

            :param partition: pandas DataFrame containing the data to process
            :return: result of applying `annotation_hgvs_partition` row-wise (axis=1)
            """
            return partition.apply(annotation_hgvs_partition, axis=1)

        def annotation_hgvs_partition(row) -> str:
            """
            Build the comma-separated list of HGVS names for one variant row.

            Closes over `polars_conn`, `transcripts`, `genome` and the HGVS option
            flags defined later in the enclosing method (safe because it is only
            called after those are bound).

            :param row: dict-like with keys CHROM, POS, REF, ALT
            :return: comma-joined HGVS names for the row
            """

            chr = row["CHROM"]
            pos = row["POS"]
            ref = row["REF"]
            alt = row["ALT"]

            # Find list of associated transcripts
            transcripts_list = list(
                polars_conn.execute(
                    f"""
                    SELECT transcript
                    FROM refseq_df
                    WHERE CHROM='{chr}'
                    AND POS={pos}
                    """
                )["transcript"]
            )

            # Full HGVS annotation in list
            hgvs_full_list = []

            for transcript_name in transcripts_list:

                # Transcript
                transcript = get_transcript(
                    transcripts=transcripts, transcript_name=transcript_name
                )
                # Exon
                if use_exon:
                    exon = transcript.find_exon_number(pos)
                else:
                    exon = None
                # Protein
                transcript_protein = None
                if use_protein or add_protein or full_format:
                    transcripts_protein = list(
                        polars_conn.execute(
                            f"""
                            SELECT protein
                            FROM refseqlink_df
                            WHERE transcript='{transcript_name}'
                            LIMIT 1
                            """
                        )["protein"]
                    )
                    if len(transcripts_protein):
                        transcript_protein = transcripts_protein[0]

                # HGVS name
                hgvs_name = format_hgvs_name(
                    chr,
                    pos,
                    ref,
                    alt,
                    genome=genome,
                    transcript=transcript,
                    transcript_protein=transcript_protein,
                    exon=exon,
                    use_gene=use_gene,
                    use_protein=use_protein,
                    full_format=full_format,
                    use_version=use_version,
                    codon_type=codon_type,
                )
                hgvs_full_list.append(hgvs_name)
                # Optionally add a second, protein-level name alongside the first
                if add_protein and not use_protein and not full_format:
                    hgvs_name = format_hgvs_name(
                        chr,
                        pos,
                        ref,
                        alt,
                        genome=genome,
                        transcript=transcript,
                        transcript_protein=transcript_protein,
                        exon=exon,
                        use_gene=use_gene,
                        use_protein=True,
                        full_format=False,
                        use_version=use_version,
                        codon_type=codon_type,
                    )
                    hgvs_full_list.append(hgvs_name)

            # Create liste of HGVS annotations
            hgvs_full = ",".join(hgvs_full_list)

            return hgvs_full

        # Polars connexion
        polars_conn = pl.SQLContext(register_globals=True, eager=True)

        # Config
        config = self.get_config()

        # Databases
        # Genome
        databases_genomes_folders = (
            config.get("folders", {})
            .get("databases", {})
            .get("genomes", DEFAULT_GENOME_FOLDER)
        )
        databases_genome = (
            config.get("folders", {}).get("databases", {}).get("genomes", "")
        )
        # refseq database folder
        databases_refseq_folders = (
            config.get("folders", {})
            .get("databases", {})
            .get("refseq", DEFAULT_REFSEQ_FOLDER)
        )
        # refseq
        databases_refseq = config.get("databases", {}).get("refSeq", None)
        # refSeqLink
        databases_refseqlink = config.get("databases", {}).get("refSeqLink", None)

        # Param
        param = self.get_param()

        # Quick HGVS: parse "opt=val,opt2,..." shortcut into param["hgvs"]
        if "hgvs_options" in param and param.get("hgvs_options", ""):
            log.info(f"Quick HGVS Annotation:")
            if not param.get("hgvs", None):
                param["hgvs"] = {}
            for option in param.get("hgvs_options", "").split(","):
                option_var_val = option.split("=")
                option_var = option_var_val[0]
                if len(option_var_val) > 1:
                    option_val = option_var_val[1]
                else:
                    option_val = "True"
                if option_val.upper() in ["TRUE"]:
                    option_val = True
                elif option_val.upper() in ["FALSE"]:
                    option_val = False
                log.info(f" {option_var}={option_val}")
                param["hgvs"][option_var] = option_val

        # Check if HGVS annotation enabled
        if "hgvs" in param:
            log.info(f"HGVS Annotation... ")
            for hgvs_option in param.get("hgvs", {}):
                log.info(f"{hgvs_option}: {param.get('hgvs',{}).get(hgvs_option)}")
        else:
            return

        # HGVS Param
        param_hgvs = param.get("hgvs", {})
        use_exon = param_hgvs.get("use_exon", False)
        use_gene = param_hgvs.get("use_gene", False)
        use_protein = param_hgvs.get("use_protein", False)
        add_protein = param_hgvs.get("add_protein", False)
        full_format = param_hgvs.get("full_format", False)
        use_version = param_hgvs.get("use_version", False)
        codon_type = param_hgvs.get("codon_type", "3")

        # refSseq refSeqLink (param overrides config)
        databases_refseq = param_hgvs.get("refseq", databases_refseq)
        databases_refseqlink = param_hgvs.get("refseqlink", databases_refseqlink)

        # Assembly
        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))

        # Genome
        genome_file = None
        if find_genome(databases_genome):
            genome_file = find_genome(databases_genome)
        else:
            genome_file = find_genome(
                genome_path=databases_genomes_folders, assembly=assembly
            )
        log.debug("Genome: " + str(genome_file))

        # refSseq
        refseq_file = find_file_prefix(
            input_file=databases_refseq,
            prefix="ncbiRefSeq",
            folder=databases_refseq_folders,
            assembly=assembly,
        )
        log.debug("refSeq: " + str(refseq_file))

        # refSeqLink
        refseqlink_file = find_file_prefix(
            input_file=databases_refseqlink,
            prefix="ncbiRefSeqLink",
            folder=databases_refseq_folders,
            assembly=assembly,
        )
        log.debug("refSeqLink: " + str(refseqlink_file))

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Variables
        table_variants = self.get_table_variants(clause="update")

        # Get variants SNV and InDel only (REF/ALT must be purely alphabetic)
        query_variants = f"""
            SELECT "#CHROM" AS CHROM, POS, REF, ALT
            FROM {table_variants}
            WHERE REF ~ '^[A-Za-z]+$' AND ALT ~ '^[A-Za-z]+$'
        """
        df_variants = self.get_query_to_df(query_variants)

        # Added columns
        added_columns = []

        # Add hgvs column in variants table (randomized name to avoid collisions)
        hgvs_column_name = "hgvs_" + str(random.randrange(1000))
        added_column = self.add_column(
            table_variants, hgvs_column_name, "STRING", default_value=None
        )
        added_columns.append(added_column)

        log.debug(f"refSeq loading...")
        # refSeq in duckDB
        refseq_table = get_refseq_table(
            conn=self.conn, refseq_table="refseq", refseq_file=refseq_file
        )
        # Loading all refSeq in Dataframe: transcripts overlapping each variant position
        refseq_query = f"""
            SELECT df_variants.CHROM, df_variants.POS, {refseq_table}.name AS transcript
            FROM {refseq_table}
            JOIN df_variants ON (
                {refseq_table}.chrom = df_variants.CHROM
                AND {refseq_table}.txStart<=df_variants.POS
                AND {refseq_table}.txEnd>=df_variants.POS
            )
        """
        refseq_df = self.conn.query(refseq_query).pl()

        if refseqlink_file:
            log.debug(f"refSeqLink loading...")
            # refSeqLink in duckDB
            refseqlink_table = get_refseq_table(
                conn=self.conn, refseq_table="refseqlink", refseq_file=refseqlink_file
            )
            # Loading all refSeqLink in Dataframe (transcript -> protein accession)
            protacc_column = "protAcc_with_ver"
            mrnaacc_column = "mrnaAcc_with_ver"
            refseqlink_query = f"""
                SELECT {refseq_table}.chrom, {protacc_column} AS protein, {mrnaacc_column} AS transcript
                FROM {refseqlink_table}
                JOIN {refseq_table} ON ({refseq_table}.name = {refseqlink_table}.mrnaAcc_with_ver)
                WHERE protAcc_without_ver IS NOT NULL
            """
            # Polars Dataframe
            refseqlink_df = self.conn.query(f"{refseqlink_query}").pl()

        # Read RefSeq transcripts into a python dict/model.
        log.debug(f"Transcripts loading...")
        with tempfile.TemporaryDirectory() as tmpdir:
            transcripts_query = f"""
                COPY (
                    SELECT {refseq_table}.*
                    FROM {refseq_table}
                    JOIN df_variants ON (
                        {refseq_table}.chrom=df_variants.CHROM
                        AND {refseq_table}.txStart<=df_variants.POS
                        AND {refseq_table}.txEnd>=df_variants.POS
                    )
                )
                TO '{tmpdir}/transcript.tsv' (DELIMITER '\t');
            """
            self.conn.query(transcripts_query)
            with open(f"{tmpdir}/transcript.tsv") as infile:
                transcripts = read_transcripts(infile)

        # Polars connexion
        polars_conn = pl.SQLContext(register_globals=True, eager=True)

        log.debug("Genome loading...")
        # Read genome sequence using pyfaidx.
        genome = Fasta(genome_file)

        log.debug("Start annotation HGVS...")

        # Create
        # a Dask Dataframe from Pandas dataframe with partition as number of threads
        ddf = dd.from_pandas(df_variants, npartitions=threads)

        # Use dask.dataframe.apply() to apply function on each partition
        ddf[hgvs_column_name] = ddf.map_partitions(partition_function)

        # Convert Dask DataFrame to Pandas Dataframe
        df = ddf.compute()

        # Convert Pandas dataframe to parquet (due to error in cast VARCHAR -> NULL ???)
        with tempfile.TemporaryDirectory() as tmpdir:
            df_parquet = os.path.join(tmpdir, "df.parquet")
            df.to_parquet(df_parquet)

            # Update hgvs column
            update_variant_query = f"""
                UPDATE {table_variants}
                SET "{hgvs_column_name}"=df."{hgvs_column_name}"
                FROM read_parquet('{df_parquet}') as df
                WHERE variants."#CHROM" = df.CHROM
                AND variants.POS = df.POS
                AND variants.REF = df.REF
                AND variants.ALT = df.ALT
                AND df."{hgvs_column_name}" NOT IN ('') AND df."{hgvs_column_name}" NOT NULL
            """
            self.execute_query(update_variant_query)

        # Update INFO column: append 'hgvs=<value>' where a name was computed
        sql_query_update = f"""
            UPDATE {table_variants}
            SET INFO =
                concat(
                    CASE
                        WHEN INFO NOT IN ('','.')
                        THEN concat(INFO, ';')
                        ELSE ''
                    END,
                    'hgvs=',
                    {hgvs_column_name}
                )
            WHERE "{hgvs_column_name}" NOT IN ('') AND "{hgvs_column_name}" NOT NULL
        """
        self.execute_query(sql_query_update)

        # Add header
        HGVS_INFOS = {
            "hgvs": {
                "ID": "hgvs",
                "Number": ".",
                "Type": "String",
                "Description": f"HGVS annotatation with HOWARD",
            }
        }

        for field in HGVS_INFOS:
            field_ID = HGVS_INFOS[field]["ID"]
            field_description = HGVS_INFOS[field]["Description"]
            self.get_header().infos[field_ID] = vcf.parser._Info(
                field_ID,
                HGVS_INFOS[field]["Number"],
                HGVS_INFOS[field]["Type"],
                field_description,
                "unknown",
                "unknown",
                code_type_map[HGVS_INFOS[field]["Type"]],
            )

        # Remove added columns (temporary working column)
        for added_column in added_columns:
            self.drop_column(column=added_column)
The annotation_hgvs function performs HGVS annotation on a set of variants using genomic
coordinates and alleles.
Parameters
- threads: The `threads` parameter is an optional integer that specifies the number of threads to use for parallel processing. If no value is provided, it will default to the number of threads obtained from the `get_threads()` method.
7726 def get_operations_help( 7727 self, operations_config_dict: dict = {}, operations_config_file: str = None 7728 ) -> list: 7729 7730 # Init 7731 operations_help = [] 7732 7733 # operations 7734 operations = self.get_config_json( 7735 name="calculations", 7736 config_dict=operations_config_dict, 7737 config_file=operations_config_file, 7738 ) 7739 for op in operations: 7740 op_name = operations[op].get("name", op).upper() 7741 op_description = operations[op].get("description", op_name) 7742 op_available = operations[op].get("available", False) 7743 if op_available: 7744 operations_help.append(f" {op_name}: {op_description}") 7745 7746 # Sort operations 7747 operations_help.sort() 7748 7749 # insert header 7750 operations_help.insert(0, "Available calculation operations:") 7751 7752 # Return 7753 return operations_help
7755 def calculation( 7756 self, 7757 operations: dict = {}, 7758 operations_config_dict: dict = {}, 7759 operations_config_file: str = None, 7760 ) -> None: 7761 """ 7762 It takes a list of operations, and for each operation, it checks if it's a python or sql 7763 operation, and then calls the appropriate function 7764 7765 param json example: 7766 "calculation": { 7767 "NOMEN": { 7768 "options": { 7769 "hgvs_field": "hgvs" 7770 }, 7771 "middle" : null 7772 } 7773 """ 7774 7775 # Param 7776 param = self.get_param() 7777 7778 # operations config 7779 operations_config = self.get_config_json( 7780 name="calculations", 7781 config_dict=operations_config_dict, 7782 config_file=operations_config_file, 7783 ) 7784 7785 # Upper keys 7786 operations_config = {k.upper(): v for k, v in operations_config.items()} 7787 7788 # Calculations 7789 7790 # Operations from param 7791 operations = param.get("calculation", {}).get("calculations", operations) 7792 7793 # Quick calculation - add 7794 if param.get("calculations", None): 7795 calculations_list = [ 7796 value for value in param.get("calculations", "").split(",") 7797 ] 7798 log.info(f"Quick Calculations:") 7799 for calculation_key in calculations_list: 7800 log.info(f" {calculation_key}") 7801 for calculation_operation in calculations_list: 7802 if calculation_operation.upper() not in operations: 7803 operations[calculation_operation.upper()] = {} 7804 add_value_into_dict( 7805 dict_tree=param, 7806 sections=[ 7807 "calculation", 7808 "calculations", 7809 calculation_operation.upper(), 7810 ], 7811 value={}, 7812 ) 7813 7814 # Operations for calculation 7815 if not operations: 7816 operations = param.get("calculation", {}).get("calculations", {}) 7817 7818 if operations: 7819 log.info(f"Calculations...") 7820 7821 # For each operations 7822 for operation_name in operations: 7823 operation_name = operation_name.upper() 7824 if operation_name not in [""]: 7825 if operation_name in operations_config: 7826 
log.info(f"Calculation '{operation_name}'") 7827 operation = operations_config[operation_name] 7828 operation_type = operation.get("type", "sql") 7829 if operation_type == "python": 7830 self.calculation_process_function( 7831 operation=operation, operation_name=operation_name 7832 ) 7833 elif operation_type == "sql": 7834 self.calculation_process_sql( 7835 operation=operation, operation_name=operation_name 7836 ) 7837 else: 7838 log.error( 7839 f"Operations config: Type '{operation_type}' NOT available" 7840 ) 7841 raise ValueError( 7842 f"Operations config: Type '{operation_type}' NOT available" 7843 ) 7844 else: 7845 log.error( 7846 f"Operations config: Calculation '{operation_name}' NOT available" 7847 ) 7848 raise ValueError( 7849 f"Operations config: Calculation '{operation_name}' NOT available" 7850 ) 7851 7852 # Explode INFOS fields into table fields 7853 if self.get_explode_infos(): 7854 self.explode_infos( 7855 prefix=self.get_explode_infos_prefix(), 7856 fields=self.get_explode_infos_fields(), 7857 force=True, 7858 )
It takes a list of operations, and for each operation, it checks if it's a python or sql operation, and then calls the appropriate function
Param JSON example:

    "calculation": {
        "NOMEN": {
            "options": {
                "hgvs_field": "hgvs"
            },
            "middle": null
        }
    }
7860 def calculation_process_sql( 7861 self, operation: dict, operation_name: str = "unknown" 7862 ) -> None: 7863 """ 7864 The `calculation_process_sql` function takes in a mathematical operation as a string and 7865 performs the operation, updating the specified table with the result. 7866 7867 :param operation: The `operation` parameter is a dictionary that contains information about the 7868 mathematical operation to be performed. It includes the following keys: 7869 :type operation: dict 7870 :param operation_name: The `operation_name` parameter is a string that represents the name of 7871 the mathematical operation being performed. It is used for logging and error handling purposes, 7872 defaults to unknown 7873 :type operation_name: str (optional) 7874 """ 7875 7876 # table variants 7877 table_variants = self.get_table_variants(clause="alter") 7878 7879 # Operation infos 7880 operation_name = operation.get("name", "unknown") 7881 log.debug(f"process sql {operation_name}") 7882 output_column_name = operation.get("output_column_name", operation_name) 7883 output_column_type = operation.get("output_column_type", "String") 7884 prefix = operation.get("explode_infos_prefix", "") 7885 output_column_type_sql = code_type_map_to_sql.get(output_column_type, "VARCHAR") 7886 output_column_description = operation.get( 7887 "output_column_description", f"{operation_name} operation" 7888 ) 7889 operation_query = operation.get("operation_query", None) 7890 if isinstance(operation_query, list): 7891 operation_query = " ".join(operation_query) 7892 operation_info_fields = operation.get("info_fields", []) 7893 operation_info_fields_check = operation.get("info_fields_check", False) 7894 operation_info = operation.get("operation_info", True) 7895 7896 if operation_query: 7897 7898 # Info fields check 7899 operation_info_fields_check_result = True 7900 if operation_info_fields_check: 7901 header_infos = self.get_header().infos 7902 for info_field in operation_info_fields: 7903 
operation_info_fields_check_result = ( 7904 operation_info_fields_check_result 7905 and info_field in header_infos 7906 ) 7907 7908 # If info fields available 7909 if operation_info_fields_check_result: 7910 7911 # Added_columns 7912 added_columns = [] 7913 7914 # Create VCF header field 7915 vcf_reader = self.get_header() 7916 vcf_reader.infos[output_column_name] = vcf.parser._Info( 7917 output_column_name, 7918 ".", 7919 output_column_type, 7920 output_column_description, 7921 "howard calculation", 7922 "0", 7923 self.code_type_map.get(output_column_type), 7924 ) 7925 7926 # Explode infos if needed 7927 log.debug(f"calculation_process_sql prefix {prefix}") 7928 added_columns += self.explode_infos( 7929 prefix=prefix, 7930 fields=[output_column_name] + operation_info_fields, 7931 force=True, 7932 ) 7933 7934 # Create column 7935 added_column = self.add_column( 7936 table_name=table_variants, 7937 column_name=prefix + output_column_name, 7938 column_type=output_column_type_sql, 7939 default_value="null", 7940 ) 7941 added_columns.append(added_column) 7942 7943 # Operation calculation 7944 try: 7945 7946 # Query to update calculation column 7947 sql_update = f""" 7948 UPDATE {table_variants} 7949 SET "{prefix}{output_column_name}" = ({operation_query}) 7950 """ 7951 self.conn.execute(sql_update) 7952 7953 # Add to INFO 7954 if operation_info: 7955 sql_update_info = f""" 7956 UPDATE {table_variants} 7957 SET "INFO" = 7958 concat( 7959 CASE 7960 WHEN "INFO" IS NOT NULL 7961 THEN concat("INFO", ';') 7962 ELSE '' 7963 END, 7964 '{output_column_name}=', 7965 "{prefix}{output_column_name}" 7966 ) 7967 WHERE "{prefix}{output_column_name}" IS NOT NULL AND "{prefix}{output_column_name}" NOT IN ('') 7968 """ 7969 self.conn.execute(sql_update_info) 7970 7971 except: 7972 log.error( 7973 f"Operations config: Calculation '{operation_name}' query failed" 7974 ) 7975 raise ValueError( 7976 f"Operations config: Calculation '{operation_name}' query failed" 7977 ) 7978 7979 # Remove 
added columns 7980 for added_column in added_columns: 7981 log.debug(f"added_column: {added_column}") 7982 self.drop_column(column=added_column) 7983 7984 else: 7985 log.error( 7986 f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}" 7987 ) 7988 raise ValueError( 7989 f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}" 7990 ) 7991 7992 else: 7993 log.error( 7994 f"Operations config: Calculation '{operation_name}' query NOT defined" 7995 ) 7996 raise ValueError( 7997 f"Operations config: Calculation '{operation_name}' query NOT defined" 7998 )
The calculation_process_sql function takes in a mathematical operation as a string and
performs the operation, updating the specified table with the result.
Parameters
- operation: The `operation` parameter is a dictionary that contains information about the mathematical operation to be performed. It includes the following keys:
- operation_name: The `operation_name` parameter is a string that represents the name of the mathematical operation being performed. It is used for logging and error handling purposes; defaults to "unknown".
8000 def calculation_process_function( 8001 self, operation: dict, operation_name: str = "unknown" 8002 ) -> None: 8003 """ 8004 The `calculation_process_function` takes in an operation dictionary and performs the specified 8005 function with the given parameters. 8006 8007 :param operation: The `operation` parameter is a dictionary that contains information about the 8008 operation to be performed. It has the following keys: 8009 :type operation: dict 8010 :param operation_name: The `operation_name` parameter is a string that represents the name of 8011 the operation being performed. It is used for logging purposes, defaults to unknown 8012 :type operation_name: str (optional) 8013 """ 8014 8015 operation_name = operation["name"] 8016 log.debug(f"process sql {operation_name}") 8017 function_name = operation["function_name"] 8018 function_params = operation["function_params"] 8019 getattr(self, function_name)(*function_params)
The calculation_process_function takes in an operation dictionary and performs the specified
function with the given parameters.
Parameters
- operation: The `operation` parameter is a dictionary that contains information about the operation to be performed. It has the following keys:
- operation_name: The `operation_name` parameter is a string that represents the name of the operation being performed. It is used for logging purposes; defaults to "unknown".
8021 def calculation_variant_id(self) -> None: 8022 """ 8023 The function `calculation_variant_id` adds a variant ID annotation to a VCF file header and 8024 updates the INFO field of a variants table with the variant ID. 8025 """ 8026 8027 # variant_id annotation field 8028 variant_id_tag = self.get_variant_id_column() 8029 added_columns = [variant_id_tag] 8030 8031 # variant_id hgvs tags" 8032 vcf_infos_tags = { 8033 variant_id_tag: "howard variant ID annotation", 8034 } 8035 8036 # Variants table 8037 table_variants = self.get_table_variants() 8038 8039 # Header 8040 vcf_reader = self.get_header() 8041 8042 # Add variant_id to header 8043 vcf_reader.infos[variant_id_tag] = vcf.parser._Info( 8044 variant_id_tag, 8045 ".", 8046 "String", 8047 vcf_infos_tags.get(variant_id_tag, "howard variant ID annotation"), 8048 "howard calculation", 8049 "0", 8050 self.code_type_map.get("String"), 8051 ) 8052 8053 # Update 8054 sql_update = f""" 8055 UPDATE {table_variants} 8056 SET "INFO" = 8057 concat( 8058 CASE 8059 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 8060 THEN '' 8061 ELSE concat("INFO", ';') 8062 END, 8063 '{variant_id_tag}=', 8064 "{variant_id_tag}" 8065 ) 8066 """ 8067 self.conn.execute(sql_update) 8068 8069 # Remove added columns 8070 for added_column in added_columns: 8071 self.drop_column(column=added_column)
The function calculation_variant_id adds a variant ID annotation to a VCF file header and
updates the INFO field of a variants table with the variant ID.
    def calculation_extract_snpeff_hgvs(
        self,
        snpeff_hgvs: str = "snpeff_hgvs",
        snpeff_field: str = "ANN",
    ) -> None:
        """
        The function `calculation_extract_snpeff_hgvs` extracts HGVS nomenclatures from the SnpEff
        annotation field in a VCF file and adds them as a new INFO annotation in the variants table.

        If the `snpeff_field` is not declared in the VCF header, the function only logs a
        warning and leaves the variants unchanged.

        :param snpeff_hgvs: Name of the INFO field that will store the HGVS nomenclatures
            extracted from the SnpEff annotation field, defaults to snpeff_hgvs
        :type snpeff_hgvs: str (optional)
        :param snpeff_field: Field in the VCF file that contains SnpEff annotations, from
            which HGVS nomenclatures are extracted, defaults to ANN
        :type snpeff_field: str (optional)
        :raises ValueError: if the SnpEff header description does not contain a quoted
            ' | '-separated field list
        """

        # Description for the new INFO header entry
        vcf_infos_tags = {
            snpeff_hgvs: "HGVS nomenclatures from snpEff annotation",
        }

        # Prefix for exploded INFO columns
        # NOTE(review): any non-empty configured prefix is overwritten with
        # "INFO/" here — looks intentional but confirm against explode_infos
        prefix = self.get_explode_infos_prefix()
        if prefix:
            prefix = "INFO/"

        # Exploded column names for the ANN source field and the HGVS output
        speff_ann_infos = prefix + snpeff_field
        speff_hgvs_infos = prefix + snpeff_hgvs

        # Variants table
        table_variants = self.get_table_variants()

        # Header
        vcf_reader = self.get_header()

        # Columns added by this calculation (dropped at the end)
        added_columns = []

        # Explode the SnpEff field into its own column
        added_columns += self.explode_infos(fields=[snpeff_field])

        if snpeff_field in vcf_reader.infos:

            log.debug(vcf_reader.infos[snpeff_field])

            # Extract the ANN sub-field names from the quoted, ' | '-separated
            # list in the header description
            ann_description = vcf_reader.infos[snpeff_field].desc
            pattern = r"'(.+?)'"
            match = re.search(pattern, ann_description)
            if match:
                ann_header_match = match.group(1).split(" | ")
                ann_header_desc = {}
                for i in range(len(ann_header_match)):
                    # Alphanumeric-only key, mapped to the original sub-field name
                    ann_header_info = "".join(
                        char for char in ann_header_match[i] if char.isalnum()
                    )
                    ann_header_desc[ann_header_info] = ann_header_match[i]
                if not ann_header_desc:
                    raise ValueError("Invalid header description format")
            else:
                raise ValueError("Invalid header description format")

            # Variant ID column used to join the dataframe back to the table
            variant_id_column = self.get_variant_id_column()
            added_columns += [variant_id_column]

            # Fetch variant IDs and ANN values into a pandas dataframe
            dataframe_snpeff_hgvs = self.get_query_to_df(
                f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """
            )

            # Compute the HGVS column from the ANN annotations
            dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[
                speff_ann_infos
            ].apply(
                lambda x: extract_snpeff_hgvs(
                    str(x), header=list(ann_header_desc.values())
                )
            )

            # Declare the new INFO field in the VCF header
            vcf_reader.infos[snpeff_hgvs] = vcf.parser._Info(
                snpeff_hgvs,
                ".",
                "String",
                vcf_infos_tags.get(snpeff_hgvs, "snpEff hgvs annotations"),
                "howard calculation",
                "0",
                self.code_type_map.get("String"),
            )

            # Append '<snpeff_hgvs>=<value>' to INFO by joining on the variant ID.
            # NOTE: the SQL references the local variable `dataframe_snpeff_hgvs`
            # by name (presumably resolved by duckdb's replacement scan) — do
            # not rename that variable
            sql_update = f"""
                UPDATE variants
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        CASE
                            WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN')
                            AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL
                            THEN concat(
                                '{snpeff_hgvs}=',
                                dataframe_snpeff_hgvs."{speff_hgvs_infos}"
                            )
                            ELSE ''
                        END
                    )
                FROM dataframe_snpeff_hgvs
                WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}"

            """
            self.conn.execute(sql_update)

            # Release the dataframe memory
            del dataframe_snpeff_hgvs
            gc.collect()

        else:

            log.warning(
                "No snpEff annotation. Please Anotate with snpEff before use this calculation option"
            )

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)
The function calculation_extract_snpeff_hgvs extracts HGVS nomenclatures from the SnpEff
annotation field in a VCF file and adds them as a new column in the variants table.
Parameters
- snpeff_hgvs: The `snpeff_hgvs` parameter in the `calculation_extract_snpeff_hgvs` function is used to specify the name of the column that will store the HGVS nomenclatures extracted from the SnpEff annotation field in a VCF file; defaults to "snpeff_hgvs".
- snpeff_field: The `snpeff_field` parameter in the `calculation_extract_snpeff_hgvs` function represents the field in the VCF file that contains SnpEff annotations; it is used to extract HGVS nomenclatures from the SnpEff annotation field and add them as a new annotation; defaults to "ANN".
    def calculation_snpeff_ann_explode(
        self,
        uniquify: bool = True,
        output_format: str = "fields",
        output_prefix: str = "snpeff_",
        snpeff_field: str = "ANN",
    ) -> None:
        """
        The `calculation_snpeff_ann_explode` function processes SnpEff annotations in a VCF file by
        exploding the annotation field and updating the variants' INFO column accordingly.

        If the `snpeff_field` is not declared in the VCF header, the function only logs a
        warning and leaves the variants unchanged.

        :param uniquify: Boolean flag that determines whether the exploded output should
            be uniquified (duplicate entries removed), defaults to True
        :type uniquify: bool (optional)
        :param output_format: Format of the generated annotations, either "fields"
            (one INFO field per ANN sub-field) or "JSON" (a single JSON INFO field),
            defaults to fields
        :type output_format: str (optional)
        :param output_prefix: Prefix added to the generated annotations to
            differentiate them from existing ones, defaults to snpeff_
        :type output_prefix: str (optional)
        :param snpeff_field: Field in the VCF file that contains SnpEff annotations;
            this field is exploded to update the variant information, defaults to ANN
        :type snpeff_field: str (optional)
        :raises ValueError: if the SnpEff header description does not contain a quoted
            ' | '-separated field list
        """

        # Internal key used for header descriptions of this calculation
        snpeff_hgvs = "snpeff_ann_explode"

        # Description for the new INFO header entries
        vcf_infos_tags = {
            snpeff_hgvs: "Explode snpEff annotations",
        }

        # Prefix for exploded INFO columns
        # NOTE(review): any non-empty configured prefix is overwritten with
        # "INFO/" here — looks intentional but confirm against explode_infos
        prefix = self.get_explode_infos_prefix()
        if prefix:
            prefix = "INFO/"

        # Exploded column names for the ANN source field and the exploded output
        speff_ann_infos = prefix + snpeff_field
        speff_hgvs_infos = prefix + snpeff_hgvs

        # Variants table
        table_variants = self.get_table_variants()

        # Header
        vcf_reader = self.get_header()

        # Columns added by this calculation (dropped at the end)
        added_columns = []

        # Explode the SnpEff field into its own column
        added_columns += self.explode_infos(fields=[snpeff_field])
        log.debug(f"snpeff_field={snpeff_field}")
        log.debug(f"added_columns={added_columns}")

        if snpeff_field in vcf_reader.infos:

            # Extract the ANN sub-field names from the quoted, ' | '-separated
            # list in the header description
            ann_description = vcf_reader.infos[snpeff_field].desc
            pattern = r"'(.+?)'"
            match = re.search(pattern, ann_description)
            if match:
                ann_header_match = match.group(1).split(" | ")
                ann_header = []
                ann_header_desc = {}
                for i in range(len(ann_header_match)):
                    # Alphanumeric-only key, mapped to the original sub-field name
                    ann_header_info = "".join(
                        char for char in ann_header_match[i] if char.isalnum()
                    )
                    ann_header.append(ann_header_info)
                    ann_header_desc[ann_header_info] = ann_header_match[i]
                if not ann_header_desc:
                    raise ValueError("Invalid header description format")
            else:
                raise ValueError("Invalid header description format")

            # Variant ID column used to join the dataframe back to the table
            variant_id_column = self.get_variant_id_column()
            added_columns += [variant_id_column]

            # Fetch variant IDs and ANN values into a pandas dataframe
            dataframe_snpeff_hgvs = self.get_query_to_df(
                f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """
            )

            # Compute the exploded annotation column from the ANN values
            dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[
                speff_ann_infos
            ].apply(
                lambda x: explode_snpeff_ann(
                    str(x),
                    uniquify=uniquify,
                    output_format=output_format,
                    prefix=output_prefix,
                    header=list(ann_header_desc.values()),
                )
            )

            # Declare the new INFO field(s) in the VCF header:
            # one JSON field, or one field per ANN sub-field
            ann_annotations_prefix = ""
            if output_format.upper() in ["JSON"]:
                # JSON output: single INFO field '<output_prefix>=<json>'
                ann_annotations_prefix = f"{output_prefix}="
                vcf_reader.infos[output_prefix] = vcf.parser._Info(
                    output_prefix,
                    ".",
                    "String",
                    vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations")
                    + " - JSON format",
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )
            else:
                # "fields" output: one header entry per ANN sub-field
                for ann_annotation in ann_header:
                    ann_annotation_id = f"{output_prefix}{ann_annotation}"
                    vcf_reader.infos[ann_annotation_id] = vcf.parser._Info(
                        ann_annotation_id,
                        ".",
                        "String",
                        vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations")
                        + f" - '{ann_header_desc[ann_annotation]}' annotation",
                        "howard calculation",
                        "0",
                        self.code_type_map.get("String"),
                    )

            # Append the exploded annotations to INFO by joining on the variant ID.
            # NOTE: the SQL references the local variable `dataframe_snpeff_hgvs`
            # by name (presumably resolved by duckdb's replacement scan) — do
            # not rename that variable
            sql_update = f"""
                UPDATE variants
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        CASE
                            WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN')
                            AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL
                            THEN concat(
                                '{ann_annotations_prefix}',
                                dataframe_snpeff_hgvs."{speff_hgvs_infos}"
                            )
                            ELSE ''
                        END
                    )
                FROM dataframe_snpeff_hgvs
                WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}"

            """
            self.conn.execute(sql_update)

            # Release the dataframe memory
            del dataframe_snpeff_hgvs
            gc.collect()

        else:

            log.warning(
                "No snpEff annotation. Please Anotate with snpEff before use this calculation option"
            )

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)
The calculation_snpeff_ann_explode function processes SnpEff annotations in a VCF file by
exploding the HGVS field and updating variant information accordingly.
Parameters
- uniquify: The `uniquify` parameter in the `calculation_snpeff_ann_explode` method is a boolean flag that determines whether the output should be uniquified, i.e. whether duplicate entries should be removed; defaults to True.
- output_format: The `output_format` parameter in the `calculation_snpeff_ann_explode` function specifies the format in which the output annotations will be generated: "fields" (default) or "JSON".
- output_prefix: The `output_prefix` parameter in the `calculation_snpeff_ann_explode` method is used to specify the prefix that will be added to the output annotations generated during the calculation process, to differentiate the newly added annotations from existing ones; defaults to "snpeff_".
- snpeff_field: The `snpeff_field` parameter in the `calculation_snpeff_ann_explode` function is used to specify the field in the VCF file that contains SnpEff annotations; this field will be processed to explode the annotations and update the variant information accordingly; defaults to "ANN".
    def calculation_extract_nomen(self) -> None:
        """
        Extract the HGVS nomenclature (NOMEN) fields from the configured HGVS INFO field.

        Reads the "calculation" parameters for the HGVS field name and an optional
        transcripts-of-preference file, computes the NOMEN structure per variant,
        declares each NOMEN sub-field in the VCF header, and appends them to the
        "INFO" column of the variants table.

        :raises ValueError: if the configured transcripts file does not exist
        """

        # Name of the temporary dataframe column holding the full NOMEN dict
        field_nomen_dict = "NOMEN_DICT"

        # NOMEN sub-fields and their VCF header descriptions
        nomen_dict = {
            "NOMEN": "NOMEN hgvs nomenclature considered as reference hgvs (official transcript, first otherwise)",
            "CNOMEN": "CNOMEN hgvs nomenclature at DNA level related to a transcript (TNOMEN)",
            "RNOMEN": "RNOMEN hgvs nomenclature at RNA level related to a transcript (TNOMEN)",
            "NNOMEN": "NNOMEN hgvs nomenclature for non-coding variant",
            "PNOMEN": "PNOMEN hgvs nomenclature at Protein level related to a transcript (TNOMEN)",
            "TVNOMEN": "TVNOMEN hgvs transcript with version (if any) used (e.g. for CNOMEN and PNOMEN)",
            "TNOMEN": "TNOMEN hgvs transcript used (e.g. for CNOMEN and PNOMEN)",
            "VNOMEN": "VNOMEN hgvs transcript version used (e.g. for CNOMEN and PNOMEN)",
            "ENOMEN": "ENOMEN hgvs exon nomenclature related to a transcript (TNOMEN)",
            "GNOMEN": "GNOMEN hgvs gene nomenclature related to a transcript (TNOMEN)",
        }

        # Param
        param = self.get_param()

        # Prefix for exploded INFO columns
        prefix = self.get_explode_infos_prefix()

        # Header
        vcf_reader = self.get_header()

        # Name of the INFO field that holds the HGVS nomenclatures
        hgvs_field = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("hgvs_field", "hgvs")
        )

        # Optional transcripts-of-preference file (first column = transcript IDs)
        transcripts_file = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("transcripts", None)
        )
        transcripts_file = full_path(transcripts_file)
        transcripts = []
        if transcripts_file:
            if os.path.exists(transcripts_file):
                transcripts_dataframe = transcripts_file_to_df(transcripts_file)
                transcripts = transcripts_dataframe.iloc[:, 0].tolist()
            else:
                log.error(f"Transcript file '{transcripts_file}' does NOT exist")
                raise ValueError(f"Transcript file '{transcripts_file}' does NOT exist")

        # Columns added by this calculation (dropped at the end)
        added_columns = []

        # Explode the HGVS field into its own column
        added_columns += self.explode_infos(fields=[hgvs_field])

        # Only proceed when the exploded HGVS column is actually available
        extra_infos = self.get_extra_infos()
        extra_field = prefix + hgvs_field

        if extra_field in extra_infos:

            # Fetch variant keys and HGVS values into a pandas dataframe
            dataframe_hgvs = self.get_query_to_df(
                f""" SELECT "#CHROM", "POS", "REF", "ALT", "{extra_field}" FROM variants """
            )

            # Compute the NOMEN dict per variant
            dataframe_hgvs[field_nomen_dict] = dataframe_hgvs[extra_field].apply(
                lambda x: find_nomen(str(x), transcripts=transcripts)
            )

            # Explode the NOMEN structure into one column per sub-field and
            # build the SQL fragments appending each to INFO
            sql_nomen_fields = []
            for nomen_field in nomen_dict:

                # Explode each field into a column
                dataframe_hgvs[nomen_field] = dataframe_hgvs[field_nomen_dict].apply(
                    lambda x: dict(x).get(nomen_field, "")
                )

                # Create VCF header field
                vcf_reader.infos[nomen_field] = vcf.parser._Info(
                    nomen_field,
                    ".",
                    "String",
                    nomen_dict.get(nomen_field, "howard calculation NOMEN"),
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )
                sql_nomen_fields.append(
                    f"""
                    CASE
                        WHEN dataframe_hgvs."{nomen_field}" NOT NULL AND dataframe_hgvs."{nomen_field}" NOT IN ('')
                        THEN concat(
                            ';{nomen_field}=',
                            dataframe_hgvs."{nomen_field}"
                        )
                        ELSE ''
                    END
                    """
                )

            # All per-field fragments as additional concat() arguments
            sql_nomen_fields_set = ", ".join(sql_nomen_fields)

            # Append the NOMEN fields to INFO by joining on the variant key.
            # NOTE: the SQL references the local variable `dataframe_hgvs` by
            # name (presumably resolved by duckdb's replacement scan) — do not
            # rename that variable
            sql_update = f"""
                UPDATE variants
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL
                            THEN ''
                            ELSE "INFO"
                        END,
                        {sql_nomen_fields_set}
                    )
                FROM dataframe_hgvs
                WHERE variants."#CHROM" = dataframe_hgvs."#CHROM"
                    AND variants."POS" = dataframe_hgvs."POS"
                    AND variants."REF" = dataframe_hgvs."REF"
                    AND variants."ALT" = dataframe_hgvs."ALT"
            """
            self.conn.execute(sql_update)

            # Release the dataframe memory
            del dataframe_hgvs
            gc.collect()

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)
This function extracts the HGVS nomenclature from the calculation/identification of NOMEN.
    def calculation_find_by_pipeline(self, tag: str = "findbypipeline") -> None:
        """
        The function `calculation_find_by_pipeline` computes, per variant, the number of
        pipelines/samples in which the variant is found, and appends it to the INFO field.

        Does nothing when the VCF has no FORMAT column or no samples.

        :param tag: The `tag` parameter is a string that represents the annotation field
            for the "findbypipeline" information in the VCF file. It is used to create the
            annotation field in the VCF header and to update the corresponding field in
            the variants table, defaults to findbypipeline
        :type tag: str (optional)
        """

        # Genotype data is required: FORMAT column plus at least one sample
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # findbypipeline annotation field
            findbypipeline_tag = tag

            # Description for the new INFO header entry
            vcf_infos_tags = {
                findbypipeline_tag: f"Number of pipeline/sample for a variant ({findbypipeline_tag})",
            }

            # Prefix for exploded INFO columns
            prefix = self.get_explode_infos_prefix()

            # Dataframe column that will hold the computed value
            findbypipeline_infos = prefix + findbypipeline_tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Variant ID column used to join the dataframe back to the table
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # Columns to fetch: variant_id, FORMAT and all sample columns
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                self.get_header_sample_list()
            )

            # Fetch genotype data into a pandas dataframe
            dataframe_findbypipeline = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Compute the findbypipeline value per variant row
            dataframe_findbypipeline[findbypipeline_infos] = (
                dataframe_findbypipeline.apply(
                    lambda row: findbypipeline(
                        row, samples=self.get_header_sample_list()
                    ),
                    axis=1,
                )
            )

            # Declare the findbypipeline annotation in the VCF header
            vcf_reader.infos[findbypipeline_tag] = vcf.parser._Info(
                findbypipeline_tag,
                ".",
                "String",
                vcf_infos_tags.get(findbypipeline_tag, "Find in pipeline/sample"),
                "howard calculation",
                "0",
                self.code_type_map.get("String"),
            )

            # Append '<tag>=<value>' to INFO by joining on the variant ID.
            # NOTE: the SQL references the local variable
            # `dataframe_findbypipeline` by name (presumably resolved by
            # duckdb's replacement scan) — do not rename that variable
            sql_update = f"""
                UPDATE variants
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        CASE
                            WHEN dataframe_findbypipeline."{findbypipeline_infos}" NOT IN ('','.')
                            AND dataframe_findbypipeline."{findbypipeline_infos}" NOT NULL
                            THEN concat(
                                '{findbypipeline_tag}=',
                                dataframe_findbypipeline."{findbypipeline_infos}"
                            )
                            ELSE ''
                        END
                    )
                FROM dataframe_findbypipeline
                WHERE variants."{variant_id_column}" = dataframe_findbypipeline."{variant_id_column}"
            """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Release the dataframe memory
            del dataframe_findbypipeline
            gc.collect()
The function calculation_find_by_pipeline performs a calculation to find the number of
pipeline/sample for a variant and updates the variant information in a VCF file.
Parameters
- tag: The `tag` parameter is a string that represents the annotation field for the "findbypipeline" information in the VCF file. It is used to create the annotation field in the VCF header and to update the corresponding field in the variants table; defaults to "findbypipeline".
    def calculation_genotype_concordance(self) -> None:
        """
        The function `calculation_genotype_concordance` calculates the genotype concordance for
        multi-caller VCF files and appends it to the INFO field of the variants table.

        Does nothing when the VCF has no FORMAT column or no samples.
        """

        # Genotype data is required: FORMAT column plus at least one sample
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # genotypeconcordance annotation field
            genotypeconcordance_tag = "genotypeconcordance"

            # Description for the new INFO header entry
            vcf_infos_tags = {
                genotypeconcordance_tag: "Concordance of genotype for multi caller VCF",
            }

            # Prefix for exploded INFO columns
            prefix = self.get_explode_infos_prefix()

            # Dataframe column that will hold the computed value
            genotypeconcordance_infos = prefix + genotypeconcordance_tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Variant ID column used to join the dataframe back to the table
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # Columns to fetch: variant_id, FORMAT and all sample columns
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                self.get_header_sample_list()
            )

            # Fetch genotype data into a pandas dataframe
            dataframe_genotypeconcordance = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Compute the genotype concordance per variant row
            dataframe_genotypeconcordance[genotypeconcordance_infos] = (
                dataframe_genotypeconcordance.apply(
                    lambda row: genotypeconcordance(
                        row, samples=self.get_header_sample_list()
                    ),
                    axis=1,
                )
            )

            # Declare the genotypeconcordance annotation in the VCF header.
            # NOTE(review): the .get() fallback text "snpEff hgvs annotations"
            # looks like a copy-paste leftover; it is never used because the
            # key is always present in vcf_infos_tags
            vcf_reader.infos[genotypeconcordance_tag] = vcf.parser._Info(
                genotypeconcordance_tag,
                ".",
                "String",
                vcf_infos_tags.get(genotypeconcordance_tag, "snpEff hgvs annotations"),
                "howard calculation",
                "0",
                self.code_type_map.get("String"),
            )

            # Append '<tag>=<value>' to INFO by joining on the variant ID.
            # NOTE: the SQL references the local variable
            # `dataframe_genotypeconcordance` by name (presumably resolved by
            # duckdb's replacement scan) — do not rename that variable
            sql_update = f"""
                UPDATE variants
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        CASE
                            WHEN dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT IN ('','.')
                            AND dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT NULL
                            THEN concat(
                                '{genotypeconcordance_tag}=',
                                dataframe_genotypeconcordance."{genotypeconcordance_infos}"
                            )
                            ELSE ''
                        END
                    )
                FROM dataframe_genotypeconcordance
                WHERE variants."{variant_id_column}" = dataframe_genotypeconcordance."{variant_id_column}"
            """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Release the dataframe memory
            del dataframe_genotypeconcordance
            gc.collect()
The function calculation_genotype_concordance calculates the genotype concordance for
multi-caller VCF files and updates the variant information in the database.
    def calculation_barcode(self, tag: str = "barcode") -> None:
        """
        The `calculation_barcode` function calculates barcode values (VaRank) for variants
        in a VCF file and appends them to the INFO field of the variants table.

        Does nothing when the VCF has no FORMAT column or no samples.

        :param tag: The `tag` parameter is used to specify the tag name that will be used
            for the barcode calculation in the VCF file. If an empty tag name is provided,
            the default tag name "barcode" is used, defaults to barcode
        :type tag: str (optional)
        """

        # Genotype data is required: FORMAT column plus at least one sample
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # Fall back to the default tag when an empty/None tag is passed
            if not tag:
                tag = "barcode"

            # Description for the new INFO header entry
            vcf_infos_tags = {
                tag: "barcode calculation (VaRank)",
            }

            # Prefix for exploded INFO columns
            prefix = self.get_explode_infos_prefix()

            # Dataframe column that will hold the computed value
            barcode_infos = prefix + tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Variant ID column used to join the dataframe back to the table
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # Columns to fetch: variant_id, FORMAT and all sample columns
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                self.get_header_sample_list()
            )

            # Fetch genotype data into a pandas dataframe
            dataframe_barcode = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Compute the barcode per variant row
            dataframe_barcode[barcode_infos] = dataframe_barcode.apply(
                lambda row: barcode(row, samples=self.get_header_sample_list()), axis=1
            )

            # Declare the barcode annotation in the VCF header.
            # NOTE(review): vcf_infos_tags.get(tag, vcf_infos_tags.get(tag)) is
            # redundant — the fallback is identical to the primary lookup
            vcf_reader.infos[tag] = vcf.parser._Info(
                tag,
                ".",
                "String",
                vcf_infos_tags.get(tag, vcf_infos_tags.get(tag)),
                "howard calculation",
                "0",
                self.code_type_map.get("String"),
            )

            # Append '<tag>=<value>' to INFO by joining on the variant ID.
            # NOTE: the SQL references the local variable `dataframe_barcode`
            # by name (presumably resolved by duckdb's replacement scan) — do
            # not rename that variable
            sql_update = f"""
                UPDATE {table_variants}
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        CASE
                            WHEN dataframe_barcode."{barcode_infos}" NOT IN ('','.')
                            AND dataframe_barcode."{barcode_infos}" NOT NULL
                            THEN concat(
                                '{tag}=',
                                dataframe_barcode."{barcode_infos}"
                            )
                            ELSE ''
                        END
                    )
                FROM dataframe_barcode
                WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}"
            """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Release the dataframe memory
            del dataframe_barcode
            gc.collect()
The `calculation_barcode` function calculates barcode values for variants in a VCF file and
updates the INFO field in the file with the calculated barcode values.

Parameters

- tag: The `tag` parameter in the `calculation_barcode` function is used to specify the tag name that will be used for the barcode calculation in the VCF file. If no tag name is provided, the default tag name is set to "barcode"; defaults to "barcode".
    def calculation_barcode_family(self, tag: str = "BCF") -> None:
        """
        The `calculation_barcode_family` function calculates a family barcode for each variant
        from the genotypes of the pedigree samples, and writes it into every sample's genotype
        as two new FORMAT sub-fields: `<tag>` (the barcode) and `<tag>S` (the sample list).

        The pedigree is read from
        param["calculation"]["calculations"]["BARCODEFAMILY"]["family_pedigree"], which can be
        a JSON file path, a JSON string, a comma-separated list of sample names, or a dict;
        when absent, all header samples are used.

        :param tag: The `tag` parameter is used to specify the barcode tag that will be added
            to the VCF file during the calculation process. If no value is provided, the
            default value used is "BCF", defaults to BCF
        :type tag: str (optional)
        :raises ValueError: if the pedigree is malformed or resolves to no samples
        """

        # Only meaningful if the VCF has genotypes (FORMAT column + samples)
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # barcode annotation field (fall back to default if empty)
            if not tag:
                tag = "BCF"

            # VCF infos tags (header descriptions for both FORMAT fields)
            vcf_infos_tags = {
                tag: "barcode family calculation",
                f"{tag}S": "barcode family samples",
            }

            # Param
            param = self.get_param()
            log.debug(f"param={param}")

            # Prefix used for exploded INFO columns
            prefix = self.get_explode_infos_prefix()

            # PED param
            ped = (
                param.get("calculation", {})
                .get("calculations", {})
                .get("BARCODEFAMILY", {})
                .get("family_pedigree", None)
            )
            log.debug(f"ped={ped}")

            # Load PED
            if ped:

                # Pedigree is a file (path to a JSON document)
                if isinstance(ped, str) and os.path.exists(full_path(ped)):
                    log.debug("Pedigree is file")
                    with open(full_path(ped)) as ped:
                        ped = json.load(ped)

                # Pedigree is a string (JSON, or comma-separated sample names)
                elif isinstance(ped, str):
                    log.debug("Pedigree is str")
                    try:
                        ped = json.loads(ped)
                        log.debug("Pedigree is json str")
                    except ValueError as e:
                        # Not JSON: treat as a comma-separated sample list
                        ped_samples = ped.split(",")
                        ped = {}
                        for ped_sample in ped_samples:
                            ped[ped_sample] = ped_sample

                # Pedigree is a dict (member -> sample name)
                elif isinstance(ped, dict):
                    log.debug("Pedigree is dict")

                # Pedigree is not well formatted
                else:
                    msg_error = "Pedigree not well formatted"
                    log.error(msg_error)
                    raise ValueError(msg_error)

                # Construct list of sample names from the pedigree values
                ped_samples = list(ped.values())

            else:
                # No pedigree provided: use every sample of the header
                log.debug("Pedigree not defined. Take all samples")
                ped_samples = self.get_header_sample_list()
                ped = {}
                for ped_sample in ped_samples:
                    ped[ped_sample] = ped_sample

            # Check pedigree (must contain at least one sample)
            if not ped or len(ped) == 0:
                msg_error = f"Error in pedigree: samples {ped_samples}"
                log.error(msg_error)
                raise ValueError(msg_error)

            # Log
            log.info(
                "Calculation 'BARCODEFAMILY' - Samples: "
                + ", ".join([f"{member}='{ped[member]}'" for member in ped])
            )
            log.debug(f"ped_samples={ped_samples}")

            # Name of the dataframe column holding the computed barcode
            barcode_infos = prefix + tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Create variant id column (used to join the dataframe back)
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and pedigree samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                ped_samples
            )

            # Fetch genotypes into a dataframe
            dataframe_barcode = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Compute the family barcode for each variant row
            dataframe_barcode[barcode_infos] = dataframe_barcode.apply(
                lambda row: barcode(row, samples=ped_samples), axis=1
            )

            # Add both barcode family FORMAT fields to the header
            vcf_reader.formats[tag] = vcf.parser._Format(
                id=tag,
                num=".",
                type="String",
                desc=vcf_infos_tags.get(tag, "barcode family calculation"),
                type_code=self.code_type_map.get("String"),
            )
            vcf_reader.formats[f"{tag}S"] = vcf.parser._Format(
                id=f"{tag}S",
                num=".",
                type="String",
                desc=vcf_infos_tags.get(f"{tag}S", "barcode family samples"),
                type_code=self.code_type_map.get("String"),
            )

            # Update: append ':<barcode>:<samples>' to every sample genotype,
            # and ':<tag>:<tag>S' to the FORMAT column; samples outside the
            # pedigree get '.' placeholders
            sql_update_set = []
            for sample in self.get_header_sample_list() + ["FORMAT"]:
                if sample in ped_samples:
                    value = f'dataframe_barcode."{barcode_infos}"'
                    value_samples = "'" + ",".join(ped_samples) + "'"
                elif sample == "FORMAT":
                    value = f"'{tag}'"
                    value_samples = f"'{tag}S'"
                else:
                    value = "'.'"
                    value_samples = "'.'"
                # For './.' genotypes, pad with one '.' per FORMAT sub-field
                # (digits/letters stripped, each ':' becomes ':.') before
                # appending the new values
                format_regex = r"[a-zA-Z0-9\s]"
                sql_update_set.append(
                    f"""
                        "{sample}" =
                            concat(
                                CASE
                                    WHEN {table_variants}."{sample}" = './.'
                                    THEN concat('./.',regexp_replace(regexp_replace({table_variants}.FORMAT, '{format_regex}', '', 'g'), ':', ':.', 'g'))
                                    ELSE {table_variants}."{sample}"
                                END,
                                ':',
                                {value},
                                ':',
                                {value_samples}
                            )
                    """
                )

            sql_update_set_join = ", ".join(sql_update_set)
            sql_update = f"""
                UPDATE {table_variants}
                SET {sql_update_set_join}
                FROM dataframe_barcode
                WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}"
            """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Delete dataframe
            del dataframe_barcode
            gc.collect()
The `calculation_barcode_family` function calculates barcode values for variants in a VCF file
and updates the INFO field in the file with the calculated barcode values.

Parameters

- tag: The `tag` parameter in the `calculation_barcode_family` function is used to specify the barcode tag that will be added to the VCF file during the calculation process. If no value is provided for the `tag` parameter, the default value used is "BCF"; defaults to "BCF".
9025 def calculation_trio(self) -> None: 9026 """ 9027 The `calculation_trio` function performs trio calculations on a VCF file by adding trio 9028 information to the INFO field of each variant. 9029 """ 9030 9031 # if FORMAT and samples 9032 if ( 9033 "FORMAT" in self.get_header_columns_as_list() 9034 and self.get_header_sample_list() 9035 ): 9036 9037 # trio annotation field 9038 trio_tag = "trio" 9039 9040 # VCF infos tags 9041 vcf_infos_tags = { 9042 "trio": "trio calculation", 9043 } 9044 9045 # Param 9046 param = self.get_param() 9047 9048 # Prefix 9049 prefix = self.get_explode_infos_prefix() 9050 9051 # Trio param 9052 trio_ped = ( 9053 param.get("calculation", {}) 9054 .get("calculations", {}) 9055 .get("TRIO", {}) 9056 .get("trio_pedigree", None) 9057 ) 9058 9059 # Load trio 9060 if trio_ped: 9061 9062 # Trio pedigree is a file 9063 if isinstance(trio_ped, str) and os.path.exists(full_path(trio_ped)): 9064 log.debug("TRIO pedigree is file") 9065 with open(full_path(trio_ped)) as trio_ped: 9066 trio_ped = json.load(trio_ped) 9067 9068 # Trio pedigree is a string 9069 elif isinstance(trio_ped, str): 9070 log.debug("TRIO pedigree is str") 9071 try: 9072 trio_ped = json.loads(trio_ped) 9073 log.debug("TRIO pedigree is json str") 9074 except ValueError as e: 9075 trio_samples = trio_ped.split(",") 9076 if len(trio_samples) == 3: 9077 trio_ped = { 9078 "father": trio_samples[0], 9079 "mother": trio_samples[1], 9080 "child": trio_samples[2], 9081 } 9082 log.debug("TRIO pedigree is list str") 9083 else: 9084 msg_error = "TRIO pedigree not well formatted" 9085 log.error(msg_error) 9086 raise ValueError(msg_error) 9087 9088 # Trio pedigree is a dict 9089 elif isinstance(trio_ped, dict): 9090 log.debug("TRIO pedigree is dict") 9091 9092 # Trio pedigree is not well formatted 9093 else: 9094 msg_error = "TRIO pedigree not well formatted" 9095 log.error(msg_error) 9096 raise ValueError(msg_error) 9097 9098 # Construct trio list 9099 trio_samples = [ 9100 
trio_ped.get("father", ""), 9101 trio_ped.get("mother", ""), 9102 trio_ped.get("child", ""), 9103 ] 9104 9105 else: 9106 log.debug("TRIO pedigree not defined. Take the first 3 samples") 9107 samples_list = self.get_header_sample_list() 9108 if len(samples_list) >= 3: 9109 trio_samples = self.get_header_sample_list()[0:3] 9110 trio_ped = { 9111 "father": trio_samples[0], 9112 "mother": trio_samples[1], 9113 "child": trio_samples[2], 9114 } 9115 else: 9116 msg_error = f"Error in TRIO pedigree: only {len(samples_list)} samples {samples_list}" 9117 log.error(msg_error) 9118 raise ValueError(msg_error) 9119 9120 # Check trio pedigree 9121 if not trio_ped or len(trio_ped) != 3: 9122 msg_error = f"Error in TRIO pedigree: {trio_ped}" 9123 log.error(msg_error) 9124 raise ValueError(msg_error) 9125 9126 # Log 9127 log.info( 9128 f"Calculation 'TRIO' - Samples: " 9129 + ", ".join([f"{member}='{trio_ped[member]}'" for member in trio_ped]) 9130 ) 9131 9132 # Field 9133 trio_infos = prefix + trio_tag 9134 9135 # Variants table 9136 table_variants = self.get_table_variants() 9137 9138 # Header 9139 vcf_reader = self.get_header() 9140 9141 # Create variant id 9142 variant_id_column = self.get_variant_id_column() 9143 added_columns = [variant_id_column] 9144 9145 # variant_id, FORMAT and samples 9146 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 9147 self.get_header_sample_list() 9148 ) 9149 9150 # Create dataframe 9151 dataframe_trio = self.get_query_to_df( 9152 f""" SELECT {samples_fields} FROM {table_variants} """ 9153 ) 9154 9155 # Create trio column 9156 dataframe_trio[trio_infos] = dataframe_trio.apply( 9157 lambda row: trio(row, samples=trio_samples), axis=1 9158 ) 9159 9160 # Add trio to header 9161 vcf_reader.infos[trio_tag] = vcf.parser._Info( 9162 trio_tag, 9163 ".", 9164 "String", 9165 vcf_infos_tags.get(trio_tag, "snpEff hgvs annotations"), 9166 "howard calculation", 9167 "0", 9168 self.code_type_map.get("String"), 9169 ) 9170 9171 # Update 9172 
sql_update = f""" 9173 UPDATE {table_variants} 9174 SET "INFO" = 9175 concat( 9176 CASE 9177 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 9178 THEN '' 9179 ELSE concat("INFO", ';') 9180 END, 9181 CASE 9182 WHEN dataframe_trio."{trio_infos}" NOT IN ('','.') 9183 AND dataframe_trio."{trio_infos}" NOT NULL 9184 THEN concat( 9185 '{trio_tag}=', 9186 dataframe_trio."{trio_infos}" 9187 ) 9188 ELSE '' 9189 END 9190 ) 9191 FROM dataframe_trio 9192 WHERE {table_variants}."{variant_id_column}" = dataframe_trio."{variant_id_column}" 9193 """ 9194 self.conn.execute(sql_update) 9195 9196 # Remove added columns 9197 for added_column in added_columns: 9198 self.drop_column(column=added_column) 9199 9200 # Delete dataframe 9201 del dataframe_trio 9202 gc.collect()
The `calculation_trio` function performs trio calculations on a VCF file by adding trio
information to the INFO field of each variant.
9204 def calculation_vaf_normalization(self) -> None: 9205 """ 9206 The `calculation_vaf_normalization` function calculates the VAF (Variant Allele Frequency) 9207 normalization for each sample in a VCF file and updates the FORMAT and INFO fields accordingly. 9208 :return: The function does not return anything. 9209 """ 9210 9211 # if FORMAT and samples 9212 if ( 9213 "FORMAT" in self.get_header_columns_as_list() 9214 and self.get_header_sample_list() 9215 ): 9216 9217 # vaf_normalization annotation field 9218 vaf_normalization_tag = "VAF" 9219 9220 # VCF infos tags 9221 vcf_infos_tags = { 9222 "VAF": "VAF Variant Frequency", 9223 } 9224 9225 # Prefix 9226 prefix = self.get_explode_infos_prefix() 9227 9228 # Variants table 9229 table_variants = self.get_table_variants() 9230 9231 # Header 9232 vcf_reader = self.get_header() 9233 9234 # Do not calculate if VAF already exists 9235 if "VAF" in vcf_reader.formats: 9236 log.debug("VAF already on genotypes") 9237 return 9238 9239 # Create variant id 9240 variant_id_column = self.get_variant_id_column() 9241 added_columns = [variant_id_column] 9242 9243 # variant_id, FORMAT and samples 9244 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 9245 f""" "{sample}" """ for sample in self.get_header_sample_list() 9246 ) 9247 9248 # Create dataframe 9249 query = f""" SELECT {variant_id_column}, FORMAT, {samples_fields} FROM {table_variants} """ 9250 log.debug(f"query={query}") 9251 dataframe_vaf_normalization = self.get_query_to_df(query=query) 9252 9253 vaf_normalization_set = [] 9254 9255 # for each sample vaf_normalization 9256 for sample in self.get_header_sample_list(): 9257 dataframe_vaf_normalization[sample] = dataframe_vaf_normalization.apply( 9258 lambda row: vaf_normalization(row, sample=sample), axis=1 9259 ) 9260 vaf_normalization_set.append( 9261 f""" "{sample}" = dataframe_vaf_normalization."{sample}" """ 9262 ) 9263 9264 # Add VAF to FORMAT 9265 dataframe_vaf_normalization["FORMAT"] = 
dataframe_vaf_normalization[ 9266 "FORMAT" 9267 ].apply(lambda x: str(x) + ":VAF") 9268 vaf_normalization_set.append( 9269 f""" "FORMAT" = dataframe_vaf_normalization."FORMAT" """ 9270 ) 9271 9272 # Add vaf_normalization to header 9273 vcf_reader.formats[vaf_normalization_tag] = vcf.parser._Format( 9274 id=vaf_normalization_tag, 9275 num="1", 9276 type="Float", 9277 desc=vcf_infos_tags.get(vaf_normalization_tag, "VAF Variant Frequency"), 9278 type_code=self.code_type_map.get("Float"), 9279 ) 9280 9281 # Create fields to add in INFO 9282 sql_vaf_normalization_set = " , ".join(vaf_normalization_set) 9283 9284 # Update 9285 sql_update = f""" 9286 UPDATE {table_variants} 9287 SET {sql_vaf_normalization_set} 9288 FROM dataframe_vaf_normalization 9289 WHERE variants."{variant_id_column}" = dataframe_vaf_normalization."{variant_id_column}" 9290 9291 """ 9292 self.conn.execute(sql_update) 9293 9294 # Remove added columns 9295 for added_column in added_columns: 9296 self.drop_column(column=added_column) 9297 9298 # Delete dataframe 9299 del dataframe_vaf_normalization 9300 gc.collect()
The `calculation_vaf_normalization` function calculates the VAF (Variant Allele Frequency)
normalization for each sample in a VCF file and updates the FORMAT and INFO fields accordingly.

Returns

The function does not return anything.
    def calculation_genotype_stats(self, info: str = "VAF") -> None:
        """
        The `calculation_genotype_stats` function calculates genotype statistics for a given
        information field in a VCF file and updates the INFO column of the variants table with
        the calculated statistics.

        For the chosen `info` field, seven INFO tags are produced and declared in the header:
        `<info>_stats_nb`, `<info>_stats_list`, `<info>_stats_min`, `<info>_stats_max`,
        `<info>_stats_mean`, `<info>_stats_mediane` and `<info>_stats_stdev`.

        :param info: genotype sub-field for which statistics are calculated
            (e.g. "VAF", "DP"), defaults to VAF
        :type info: str (optional)
        """

        # Only meaningful if the VCF has genotypes (FORMAT column + samples)
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # stats annotation field base name
            vaf_stats_tag = info + "_stats"

            # VCF infos tags (one description per produced statistic)
            vcf_infos_tags = {
                info + "_stats_nb": f"genotype {info} Statistics - number of {info}",
                info + "_stats_list": f"genotype {info} Statistics - list of {info}",
                info + "_stats_min": f"genotype {info} Statistics - min {info}",
                info + "_stats_max": f"genotype {info} Statistics - max {info}",
                info + "_stats_mean": f"genotype {info} Statistics - mean {info}",
                info + "_stats_mediane": f"genotype {info} Statistics - mediane {info}",
                info
                + "_stats_stdev": f"genotype {info} Statistics - standard deviation {info}",
            }

            # Prefix used for exploded INFO columns
            prefix = self.get_explode_infos_prefix()

            # Name of the dataframe column holding the stats dict
            vaf_stats_infos = prefix + vaf_stats_tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Create variant id column (used to join the dataframe back)
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                self.get_header_sample_list()
            )

            # Fetch genotypes into a dataframe
            dataframe_vaf_stats = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Compute the statistics dict for each variant row
            dataframe_vaf_stats[vaf_stats_infos] = dataframe_vaf_stats.apply(
                lambda row: genotype_stats(
                    row, samples=self.get_header_sample_list(), info=info
                ),
                axis=1,
            )

            # SQL fragments, one per statistic
            sql_vaf_stats_fields = []

            # Check all VAF stats infos
            for stat in vcf_infos_tags:

                # Extract this statistic from the stats dict into its own column
                dataframe_vaf_stats[stat] = dataframe_vaf_stats[vaf_stats_infos].apply(
                    lambda x: dict(x).get(stat, "")
                )

                # Declare the statistic tag in the header
                vcf_reader.infos[stat] = vcf.parser._Info(
                    stat,
                    ".",
                    "String",
                    vcf_infos_tags.get(stat, "genotype statistics"),
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )

                # ';' separator before every statistic except the first
                if len(sql_vaf_stats_fields):
                    sep = ";"
                else:
                    sep = ""

                # Create fields to add in INFO
                sql_vaf_stats_fields.append(
                    f"""
                        CASE
                            WHEN dataframe_vaf_stats."{stat}" NOT NULL
                            THEN concat(
                                    '{sep}{stat}=',
                                    dataframe_vaf_stats."{stat}"
                                )
                            ELSE ''
                        END
                    """
                )

            # SQL set for update
            sql_vaf_stats_fields_set = ", ".join(sql_vaf_stats_fields)

            # Append all '<stat>=<value>' pairs to INFO, keeping existing content
            sql_update = f"""
                UPDATE {table_variants}
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        {sql_vaf_stats_fields_set}
                    )
                FROM dataframe_vaf_stats
                WHERE {table_variants}."{variant_id_column}" = dataframe_vaf_stats."{variant_id_column}"

            """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Delete dataframe
            del dataframe_vaf_stats
            gc.collect()
The `calculation_genotype_stats` function calculates genotype statistics for a given information
field in a VCF file and updates the INFO column of the variants table with the calculated
statistics.

Parameters

- info: The `info` parameter is a string that represents the type of information for which genotype statistics are calculated. It is used to generate various VCF info tags for the statistics, such as the number of occurrences, the list of values, the minimum value, the maximum value, the mean, and the median; defaults to "VAF".
9440 def calculation_transcripts_annotation( 9441 self, info_json: str = None, info_format: str = None 9442 ) -> None: 9443 """ 9444 The `calculation_transcripts_annotation` function creates a transcripts table and adds an info 9445 field to it if transcripts are available. 9446 9447 :param info_json: The `info_json` parameter in the `calculation_transcripts_annotation` method 9448 is a string parameter that represents the information field to be used in the transcripts JSON. 9449 It is used to specify the JSON format for the transcripts information. If no value is provided 9450 when calling the method, it defaults to " 9451 :type info_json: str 9452 :param info_format: The `info_format` parameter in the `calculation_transcripts_annotation` 9453 method is a string parameter that specifies the format of the information field to be used in 9454 the transcripts JSON. It is used to define the format of the information field 9455 :type info_format: str 9456 """ 9457 9458 # Create transcripts table 9459 transcripts_table = self.create_transcript_view() 9460 9461 # Add info field 9462 if transcripts_table: 9463 self.transcript_view_to_variants( 9464 transcripts_table=transcripts_table, 9465 transcripts_info_field_json=info_json, 9466 transcripts_info_field_format=info_format, 9467 ) 9468 else: 9469 log.info("No Transcripts to process. Check param.json file configuration")
The `calculation_transcripts_annotation` function creates a transcripts table and adds an info
field to it if transcripts are available.

Parameters

- info_json: The `info_json` parameter in the `calculation_transcripts_annotation` method is a string parameter that represents the information field to be used in the transcripts JSON. It is used to specify the JSON format for the transcripts information.
- info_format: The `info_format` parameter in the `calculation_transcripts_annotation` method is a string parameter that specifies the format of the information field to be used in the transcripts JSON. It is used to define the format of the information field.
9471 def calculation_transcripts_prioritization(self) -> None: 9472 """ 9473 The function `calculation_transcripts_prioritization` creates a transcripts table and 9474 prioritizes transcripts based on certain criteria. 9475 """ 9476 9477 # Create transcripts table 9478 transcripts_table = self.create_transcript_view() 9479 9480 # Add info field 9481 if transcripts_table: 9482 self.transcripts_prioritization(transcripts_table=transcripts_table) 9483 else: 9484 log.info("No Transcripts to process. Check param.json file configuration")
The `calculation_transcripts_prioritization` function creates a transcripts table and
prioritizes transcripts based on certain criteria.
9490 def transcripts_prioritization( 9491 self, transcripts_table: str = None, param: dict = {} 9492 ) -> bool: 9493 """ 9494 The `transcripts_prioritization` function prioritizes transcripts based on certain parameters 9495 and updates the variants table with the prioritized information. 9496 9497 :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name 9498 of the table containing transcripts data. If no value is provided, it defaults to "transcripts". 9499 This parameter is used to identify the table where the transcripts data is stored for the 9500 prioritization process 9501 :type transcripts_table: str 9502 :param param: The `param` parameter in the `transcripts_prioritization` method is a dictionary 9503 that contains various configuration settings for the prioritization process of transcripts. It 9504 is used to customize the behavior of the prioritization algorithm and includes settings such as 9505 the prefix for prioritization fields, default profiles, and other 9506 :type param: dict 9507 :return: The function `transcripts_prioritization` returns a boolean value `True` if the 9508 transcripts prioritization process is successfully completed, and `False` if there are any 9509 issues or if no profile is defined for transcripts prioritization. 
9510 """ 9511 9512 log.debug("Start transcripts prioritization...") 9513 9514 # Param 9515 if not param: 9516 param = self.get_param() 9517 9518 # Variants table 9519 table_variants = self.get_table_variants() 9520 log.debug(f"transcripts_table={transcripts_table}") 9521 # Transcripts table 9522 if transcripts_table is None: 9523 log.debug(f"transcripts_table={transcripts_table}") 9524 transcripts_table = self.create_transcript_view( 9525 transcripts_table="transcripts", param=param 9526 ) 9527 log.debug(f"transcripts_table={transcripts_table}") 9528 if transcripts_table is None: 9529 msg_err = "No Transcripts table availalble" 9530 log.error(msg_err) 9531 raise ValueError(msg_err) 9532 9533 # Get transcripts columns 9534 columns_as_list_query = f""" 9535 DESCRIBE {transcripts_table} 9536 """ 9537 columns_as_list = list( 9538 self.get_query_to_df(columns_as_list_query)["column_name"] 9539 ) 9540 9541 # Create INFO if not exists 9542 if "INFO" not in columns_as_list: 9543 query_add_info = f""" 9544 ALTER TABLE {transcripts_table} ADD COLUMN INFO STRING DEFAULT ''; 9545 """ 9546 self.execute_query(query_add_info) 9547 9548 # Prioritization param and Force only PZ Score and Flag 9549 pz_param = param.get("transcripts", {}).get("prioritization", {}) 9550 pz_fields_score = pz_param.get("pzprefix", "PTZ") + "Score" 9551 pz_fields_flag = pz_param.get("pzprefix", "PTZ") + "Flag" 9552 pz_fields_transcripts = pz_param.get("pzprefix", "PTZ") + "Transcript" 9553 pz_param["pzfields"] = [pz_fields_score, pz_fields_flag] 9554 pz_profile_default = ( 9555 param.get("transcripts", {}).get("prioritization", {}).get("profiles", None) 9556 ) 9557 9558 # Exit if no profile 9559 if pz_profile_default is None: 9560 log.warning("No profile defined for transcripts prioritization") 9561 return False 9562 9563 # Prioritization 9564 prioritization_result = self.prioritization( 9565 table=transcripts_table, 9566 pz_param=param.get("transcripts", {}).get("prioritization", {}), 9567 ) 9568 if not 
prioritization_result: 9569 log.warning("Transcripts prioritization not processed") 9570 return False 9571 9572 # Explode PZ fields 9573 self.explode_infos( 9574 table=transcripts_table, 9575 fields=param.get("transcripts", {}) 9576 .get("prioritization", {}) 9577 .get("pzfields", []), 9578 ) 9579 9580 # Export Transcripts prioritization infos to variants table 9581 query_update = f""" 9582 WITH RankedTranscripts AS ( 9583 SELECT 9584 "#CHROM", POS, REF, ALT, transcript, {pz_fields_score}, {pz_fields_flag}, 9585 ROW_NUMBER() OVER ( 9586 PARTITION BY "#CHROM", POS, REF, ALT 9587 ORDER BY {pz_fields_flag} ASC, {pz_fields_score} DESC, transcript ASC 9588 ) AS rn 9589 FROM 9590 {transcripts_table} 9591 ) 9592 UPDATE {table_variants} 9593 SET 9594 INFO = CONCAT(CASE 9595 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 9596 THEN '' 9597 ELSE concat("INFO", ';') 9598 END, 9599 concat('{pz_fields_transcripts}=', transcript, ';{pz_fields_score}=', {pz_fields_score}, ';{pz_fields_flag}=', {pz_fields_flag}) 9600 ) 9601 FROM 9602 RankedTranscripts 9603 WHERE 9604 rn = 1 9605 AND variants."#CHROM" = RankedTranscripts."#CHROM" 9606 AND variants."POS" = RankedTranscripts."POS" 9607 AND variants."REF" = RankedTranscripts."REF" 9608 AND variants."ALT" = RankedTranscripts."ALT" 9609 9610 """ 9611 self.execute_query(query=query_update) 9612 9613 # Add PZ Transcript in header 9614 self.get_header().infos[pz_fields_transcripts] = vcf.parser._Info( 9615 pz_fields_transcripts, 9616 ".", 9617 "String", 9618 f"Transcript selected from transcripts prioritization process, profile {pz_profile_default}", 9619 "unknown", 9620 "unknown", 9621 code_type_map["String"], 9622 ) 9623 9624 # Return 9625 return True
The `transcripts_prioritization` function prioritizes transcripts based on certain parameters
and updates the variants table with the prioritized information.

Parameters

- transcripts_table: The `transcripts_table` parameter is a string that specifies the name of the table containing transcripts data. If no value is provided, it defaults to "transcripts". This parameter is used to identify the table where the transcripts data is stored for the prioritization process.
- param: The `param` parameter in the `transcripts_prioritization` method is a dictionary that contains various configuration settings for the prioritization process of transcripts. It is used to customize the behavior of the prioritization algorithm and includes settings such as the prefix for prioritization fields and default profiles.

Returns

The function `transcripts_prioritization` returns a boolean value `True` if the transcripts prioritization process is successfully completed, and `False` if there are any issues or if no profile is defined for transcripts prioritization.
9627 def create_transcript_view_from_columns_map( 9628 self, 9629 transcripts_table: str = "transcripts", 9630 columns_maps: dict = {}, 9631 added_columns: list = [], 9632 temporary_tables: list = None, 9633 annotation_fields: list = None, 9634 ) -> tuple[list, list, list]: 9635 """ 9636 The `create_transcript_view_from_columns_map` function generates a temporary table view based on 9637 specified columns mapping for transcripts data. 9638 9639 :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name of 9640 the table where the transcripts data is stored or will be stored in the database. This table 9641 typically contains information about transcripts such as Ensembl transcript IDs, gene names, scores, 9642 predictions, etc. It defaults to "transcripts, defaults to transcripts 9643 :type transcripts_table: str (optional) 9644 :param columns_maps: The `columns_maps` parameter is a dictionary that contains information about 9645 how to map columns from a transcripts table to create a view. Each entry in the `columns_maps` list 9646 represents a mapping configuration for a specific set of columns. It typically includes details such 9647 as the main transcript column and additional information columns 9648 :type columns_maps: dict 9649 :param added_columns: The `added_columns` parameter in the `create_transcript_view_from_columns_map` 9650 function is a list that stores the additional columns that will be added to the view being created 9651 based on the columns map provided. These columns are generated by exploding the transcript 9652 information columns along with the main transcript column 9653 :type added_columns: list 9654 :param temporary_tables: The `temporary_tables` parameter in the 9655 `create_transcript_view_from_columns_map` function is a list that stores the names of temporary 9656 tables created during the process of creating a transcript view from a columns map. 
These temporary 9657 tables are used to store intermediate results or transformations before the final view is generated 9658 :type temporary_tables: list 9659 :param annotation_fields: The `annotation_fields` parameter in the 9660 `create_transcript_view_from_columns_map` function is a list that stores the fields that are used 9661 for annotation in the query view creation process. These fields are extracted from the 9662 `transcripts_column` and `transcripts_infos_columns` specified in the `columns 9663 :type annotation_fields: list 9664 :return: The function `create_transcript_view_from_columns_map` returns a tuple containing three 9665 lists: `added_columns`, `temporary_tables`, and `annotation_fields`. 9666 """ 9667 9668 log.debug("Start transcrpts view creation from columns map...") 9669 9670 # "from_columns_map": [ 9671 # { 9672 # "transcripts_column": "Ensembl_transcriptid", 9673 # "transcripts_infos_columns": [ 9674 # "genename", 9675 # "Ensembl_geneid", 9676 # "LIST_S2_score", 9677 # "LIST_S2_pred", 9678 # ], 9679 # }, 9680 # { 9681 # "transcripts_column": "Ensembl_transcriptid", 9682 # "transcripts_infos_columns": [ 9683 # "genename", 9684 # "VARITY_R_score", 9685 # "Aloft_pred", 9686 # ], 9687 # }, 9688 # ], 9689 9690 # Init 9691 if temporary_tables is None: 9692 temporary_tables = [] 9693 if annotation_fields is None: 9694 annotation_fields = [] 9695 9696 # Variants table 9697 table_variants = self.get_table_variants() 9698 9699 for columns_map in columns_maps: 9700 9701 # Transcript column 9702 transcripts_column = columns_map.get("transcripts_column", None) 9703 9704 # Transcripts infos columns 9705 transcripts_infos_columns = columns_map.get("transcripts_infos_columns", []) 9706 9707 if transcripts_column is not None: 9708 9709 # Explode 9710 added_columns += self.explode_infos( 9711 fields=[transcripts_column] + transcripts_infos_columns 9712 ) 9713 9714 # View clauses 9715 clause_select = [] 9716 for field in [transcripts_column] + 
transcripts_infos_columns: 9717 clause_select.append( 9718 f""" regexp_split_to_table("{field}", ',') AS '{field}' """ 9719 ) 9720 if field not in [transcripts_column]: 9721 annotation_fields.append(field) 9722 9723 # Querey View 9724 query = f""" 9725 SELECT 9726 "#CHROM", POS, REF, ALT, INFO, 9727 "{transcripts_column}" AS 'transcript', 9728 {", ".join(clause_select)} 9729 FROM ( 9730 SELECT 9731 "#CHROM", POS, REF, ALT, INFO, 9732 {", ".join(clause_select)} 9733 FROM {table_variants} 9734 ) 9735 WHERE "{transcripts_column}" IS NOT NULL 9736 """ 9737 9738 # Create temporary table 9739 temporary_table = transcripts_table + "".join( 9740 random.choices(string.ascii_uppercase + string.digits, k=10) 9741 ) 9742 9743 # Temporary_tables 9744 temporary_tables.append(temporary_table) 9745 query_view = f""" 9746 CREATE TEMPORARY TABLE {temporary_table} 9747 AS ({query}) 9748 """ 9749 self.execute_query(query=query_view) 9750 9751 return added_columns, temporary_tables, annotation_fields
The create_transcript_view_from_columns_map function generates a temporary table view based on
specified columns mapping for transcripts data.
Parameters
- `transcripts_table`: A string giving the name of the table where the transcripts data is (or will be) stored in the database. This table typically contains information about transcripts such as Ensembl transcript IDs, gene names, scores and predictions. Defaults to "transcripts".
- `columns_maps`: A list of mapping configurations describing how to map columns from a transcripts table to create a view. Each entry typically includes the main transcript column and additional information columns.
- `added_columns`: A list that stores the additional columns added while building the view from the columns map. These columns are generated by exploding the transcript information columns along with the main transcript column.
- `temporary_tables`: A list that stores the names of temporary tables created while building the transcript view. These temporary tables hold intermediate results or transformations before the final view is generated.
- `annotation_fields`: A list that stores the fields used for annotation during the query view creation. These fields are extracted from the `transcripts_column` and `transcripts_infos_columns` specified in the `columns_maps`.
Returns
The function `create_transcript_view_from_columns_map` returns a tuple containing three lists: `added_columns`, `temporary_tables`, and `annotation_fields`.
9753 def create_transcript_view_from_column_format( 9754 self, 9755 transcripts_table: str = "transcripts", 9756 column_formats: dict = {}, 9757 temporary_tables: list = None, 9758 annotation_fields: list = None, 9759 ) -> tuple[list, list, list]: 9760 """ 9761 The `create_transcript_view_from_column_format` function generates a transcript view based on 9762 specified column formats, adds additional columns and annotation fields, and returns the list of 9763 temporary tables and annotation fields. 9764 9765 :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name of 9766 the table containing the transcripts data. This table will be used as the base table for creating 9767 the transcript view. The default value for this parameter is "transcripts", but you can provide a 9768 different table name if needed, defaults to transcripts 9769 :type transcripts_table: str (optional) 9770 :param column_formats: The `column_formats` parameter is a dictionary that contains information 9771 about the columns to be used for creating the transcript view. Each entry in the dictionary 9772 specifies the mapping between a transcripts column and a transcripts infos column. For example, in 9773 the provided code snippet: 9774 :type column_formats: dict 9775 :param temporary_tables: The `temporary_tables` parameter in the 9776 `create_transcript_view_from_column_format` function is a list that stores the names of temporary 9777 views created during the process of creating a transcript view from a column format. These temporary 9778 views are used to manipulate and extract data before generating the final transcript view. It 9779 :type temporary_tables: list 9780 :param annotation_fields: The `annotation_fields` parameter in the 9781 `create_transcript_view_from_column_format` function is a list that stores the annotation fields 9782 that are extracted from the temporary views created during the process. 
These annotation fields are 9783 obtained by querying the temporary views and extracting the column names excluding specific columns 9784 like `#CH 9785 :type annotation_fields: list 9786 :return: The `create_transcript_view_from_column_format` function returns two lists: 9787 `temporary_tables` and `annotation_fields`. 9788 """ 9789 9790 log.debug("Start transcrpts view creation from column format...") 9791 9792 # "from_column_format": [ 9793 # { 9794 # "transcripts_column": "ANN", 9795 # "transcripts_infos_column": "Feature_ID", 9796 # } 9797 # ], 9798 9799 # Init 9800 if temporary_tables is None: 9801 temporary_tables = [] 9802 if annotation_fields is None: 9803 annotation_fields = [] 9804 9805 for column_format in column_formats: 9806 9807 # annotation field and transcript annotation field 9808 annotation_field = column_format.get("transcripts_column", "ANN") 9809 transcript_annotation = column_format.get( 9810 "transcripts_infos_column", "Feature_ID" 9811 ) 9812 9813 # Temporary View name 9814 temporary_view_name = transcripts_table + "".join( 9815 random.choices(string.ascii_uppercase + string.digits, k=10) 9816 ) 9817 9818 # Create temporary view name 9819 temporary_view_name = self.annotation_format_to_table( 9820 uniquify=True, 9821 annotation_field=annotation_field, 9822 view_name=temporary_view_name, 9823 annotation_id=transcript_annotation, 9824 ) 9825 9826 # Annotation fields 9827 if temporary_view_name: 9828 query_annotation_fields = f""" 9829 SELECT * 9830 FROM ( 9831 DESCRIBE SELECT * 9832 FROM {temporary_view_name} 9833 ) 9834 WHERE column_name not in ('#CHROM', 'POS', 'REF', 'ALT') 9835 """ 9836 df_annotation_fields = self.get_query_to_df( 9837 query=query_annotation_fields 9838 ) 9839 9840 # Add temporary view and annotation fields 9841 temporary_tables.append(temporary_view_name) 9842 annotation_fields += list(set(df_annotation_fields["column_name"])) 9843 9844 return temporary_tables, annotation_fields
The create_transcript_view_from_column_format function generates a transcript view based on
specified column formats, adds additional columns and annotation fields, and returns the list of
temporary tables and annotation fields.
Parameters
- `transcripts_table`: A string specifying the name of the table containing the transcripts data, used as the base table for creating the transcript view. Defaults to "transcripts".
- `column_formats`: A list of mappings describing the columns used for creating the transcript view. Each entry specifies the mapping between a transcripts column and a transcripts infos column.
- `temporary_tables`: A list that stores the names of temporary views created while building the transcript view from a column format. These temporary views are used to manipulate and extract data before generating the final transcript view.
- `annotation_fields`: A list that stores the annotation fields extracted from the temporary views, obtained by querying the views for their column names (excluding fixed columns such as `#CHROM`, `POS`, `REF` and `ALT`).
Returns
The `create_transcript_view_from_column_format` function returns two lists: `temporary_tables` and `annotation_fields`.
9846 def create_transcript_view( 9847 self, 9848 transcripts_table: str = None, 9849 transcripts_table_drop: bool = True, 9850 param: dict = {}, 9851 ) -> str: 9852 """ 9853 The `create_transcript_view` function generates a transcript view by processing data from a 9854 specified table based on provided parameters and structural information. 9855 9856 :param transcripts_table: The `transcripts_table` parameter in the `create_transcript_view` function 9857 is used to specify the name of the table that will store the final transcript view data. If a table 9858 name is not provided, the function will create a new table to store the transcript view data, and by 9859 default,, defaults to transcripts 9860 :type transcripts_table: str (optional) 9861 :param transcripts_table_drop: The `transcripts_table_drop` parameter in the 9862 `create_transcript_view` function is a boolean parameter that determines whether to drop the 9863 existing transcripts table before creating a new one. If `transcripts_table_drop` is set to `True`, 9864 the function will drop the existing transcripts table if it exists, defaults to True 9865 :type transcripts_table_drop: bool (optional) 9866 :param param: The `param` parameter in the `create_transcript_view` function is a dictionary that 9867 contains information needed to create a transcript view. It includes details such as the structure 9868 of the transcripts, columns mapping, column formats, and other necessary information for generating 9869 the view. This parameter allows for flexibility and customization 9870 :type param: dict 9871 :return: The `create_transcript_view` function returns the name of the transcripts table that was 9872 created or modified during the execution of the function. 
9873 """ 9874 9875 log.debug("Start transcripts view creation...") 9876 9877 # Default 9878 transcripts_table_default = "transcripts" 9879 9880 # Param 9881 if not param: 9882 param = self.get_param() 9883 9884 # Struct 9885 struct = param.get("transcripts", {}).get("struct", None) 9886 9887 if struct: 9888 9889 # Transcripts table 9890 if transcripts_table is None: 9891 transcripts_table = param.get("transcripts", {}).get( 9892 "table", transcripts_table_default 9893 ) 9894 9895 # added_columns 9896 added_columns = [] 9897 9898 # Temporary tables 9899 temporary_tables = [] 9900 9901 # Annotation fields 9902 annotation_fields = [] 9903 9904 # from columns map 9905 columns_maps = struct.get("from_columns_map", []) 9906 added_columns_tmp, temporary_tables_tmp, annotation_fields_tmp = ( 9907 self.create_transcript_view_from_columns_map( 9908 transcripts_table=transcripts_table, 9909 columns_maps=columns_maps, 9910 added_columns=added_columns, 9911 temporary_tables=temporary_tables, 9912 annotation_fields=annotation_fields, 9913 ) 9914 ) 9915 added_columns += added_columns_tmp 9916 temporary_tables += temporary_tables_tmp 9917 annotation_fields += annotation_fields_tmp 9918 9919 # from column format 9920 column_formats = struct.get("from_column_format", []) 9921 temporary_tables_tmp, annotation_fields_tmp = ( 9922 self.create_transcript_view_from_column_format( 9923 transcripts_table=transcripts_table, 9924 column_formats=column_formats, 9925 temporary_tables=temporary_tables, 9926 annotation_fields=annotation_fields, 9927 ) 9928 ) 9929 temporary_tables += temporary_tables_tmp 9930 annotation_fields += annotation_fields_tmp 9931 9932 # Merge temporary tables query 9933 query_merge = "" 9934 for temporary_table in temporary_tables: 9935 9936 # First temporary table 9937 if not query_merge: 9938 query_merge = f""" 9939 SELECT * FROM {temporary_table} 9940 """ 9941 # other temporary table (using UNION) 9942 else: 9943 query_merge += f""" 9944 UNION BY NAME SELECT * FROM 
{temporary_table} 9945 """ 9946 9947 # Merge on transcript 9948 query_merge_on_transcripts_annotation_fields = [] 9949 # Aggregate all annotations fields 9950 for annotation_field in set(annotation_fields): 9951 query_merge_on_transcripts_annotation_fields.append( 9952 f""" list_aggregate(list_distinct(array_agg({annotation_field})), 'string_agg', ',') AS {annotation_field} """ 9953 ) 9954 # Query for transcripts view 9955 query_merge_on_transcripts = f""" 9956 SELECT "#CHROM", POS, REF, ALT, INFO, transcript, {", ".join(query_merge_on_transcripts_annotation_fields)} 9957 FROM ({query_merge}) 9958 GROUP BY "#CHROM", POS, REF, ALT, INFO, transcript 9959 """ 9960 9961 # Drop transcript view is necessary 9962 if transcripts_table_drop: 9963 query_drop = f""" 9964 DROP TABLE IF EXISTS {transcripts_table}; 9965 """ 9966 self.execute_query(query=query_drop) 9967 9968 # Merge and create transcript view 9969 query_create_view = f""" 9970 CREATE TABLE IF NOT EXISTS {transcripts_table} 9971 AS {query_merge_on_transcripts} 9972 """ 9973 self.execute_query(query=query_create_view) 9974 9975 # Remove added columns 9976 for added_column in added_columns: 9977 self.drop_column(column=added_column) 9978 9979 else: 9980 9981 transcripts_table = None 9982 9983 return transcripts_table
The create_transcript_view function generates a transcript view by processing data from a
specified table based on provided parameters and structural information.
Parameters
- `transcripts_table`: The name of the table that will store the final transcript view data. If not provided, the function derives the table name from the parameters, defaulting to "transcripts".
- `transcripts_table_drop`: A boolean that determines whether to drop the existing transcripts table before creating a new one. If set to `True`, the existing transcripts table is dropped if it exists. Defaults to True.
- `param`: A dictionary containing the information needed to create the transcript view, such as the structure of the transcripts, columns mapping and column formats. This parameter allows for flexibility and customization.
Returns
The `create_transcript_view` function returns the name of the transcripts table that was created or modified during the execution of the function.
    def annotation_format_to_table(
        self,
        uniquify: bool = True,
        annotation_field: str = "ANN",
        annotation_id: str = "Feature_ID",
        view_name: str = "transcripts",
    ) -> str:
        """
        Convert a structured annotation INFO field (e.g. snpEff 'ANN') into a
        temporary table, one row per variant/annotation entry.

        The annotation sub-field names are read from the field's VCF header
        description (the part quoted between single quotes, split on " | ").
        Each sub-field becomes a typed column, and `annotation_id` is exposed
        as the 'transcript' column.

        :param uniquify: Whether to uniquify values when exploding annotation
            entries (forwarded to `explode_annotation_format`), defaults to True
        :type uniquify: bool (optional)
        :param annotation_field: Name of the INFO field containing the
            annotations, defaults to "ANN"
        :type annotation_field: str (optional)
        :param annotation_id: Annotation sub-field used as transcript
            identifier (non-alphanumeric characters are stripped), defaults to
            "Feature_ID"
        :type annotation_id: str (optional)
        :param view_name: Name of the temporary table to create, defaults to
            "transcripts"
        :type view_name: str (optional)
        :return: The name of the created table, or None when
            `annotation_field` is not declared in the VCF header
        """

        # Name of the intermediate exploded-annotation column
        annotation_format = "annotation_explode"

        # Transcript annotation identifier: keep alphanumeric characters only
        # (matches the cleaning applied to the header sub-field names below)
        annotation_id = "".join(char for char in annotation_id if char.isalnum())

        # Prefix for exploded INFO columns.
        # NOTE(review): any truthy configured prefix is forced to "INFO/" here —
        # confirm this is intended rather than using the configured value.
        prefix = self.get_explode_infos_prefix()
        if prefix:
            prefix = "INFO/"

        # Column names of the annotation field and its exploded counterpart
        annotation_infos = prefix + annotation_field
        annotation_format_infos = prefix + annotation_format

        # Variants table
        table_variants = self.get_table_variants()

        # VCF header (provides the annotation field description)
        vcf_reader = self.get_header()

        # Columns added to the variants table, dropped before returning
        added_columns = []

        # Explode the annotation field into a column
        added_columns += self.explode_infos(fields=[annotation_field])

        if annotation_field in vcf_reader.infos:

            # Extract the annotation sub-field names from the header
            # description: the quoted part, split on " | "
            ann_description = vcf_reader.infos[annotation_field].desc
            pattern = r"'(.+?)'"
            match = re.search(pattern, ann_description)
            if match:
                ann_header_match = match.group(1).split(" | ")
                ann_header = []
                ann_header_desc = {}
                for i in range(len(ann_header_match)):
                    # Cleaned name (alphanumeric only) -> original name
                    ann_header_info = "".join(
                        char for char in ann_header_match[i] if char.isalnum()
                    )
                    ann_header.append(ann_header_info)
                    ann_header_desc[ann_header_info] = ann_header_match[i]
                if not ann_header_desc:
                    raise ValueError("Invalid header description format")
            else:
                raise ValueError("Invalid header description format")

            # Create variant id column
            variant_id_column = self.get_variant_id_column()
            added_columns += [variant_id_column]

            # Load variants plus the exploded annotation column into a
            # DataFrame (queried by name below via duckdb replacement scan)
            dataframe_annotation_format = self.get_query_to_df(
                f""" SELECT "#CHROM", POS, REF, ALT, INFO, "{variant_id_column}", "{annotation_infos}" FROM {table_variants} """
            )

            # Explode each variant's annotations into a JSON document
            dataframe_annotation_format[
                annotation_format_infos
            ] = dataframe_annotation_format[annotation_infos].apply(
                lambda x: explode_annotation_format(
                    annotation=str(x),
                    uniquify=uniquify,
                    output_format="JSON",
                    prefix="",
                    header=list(ann_header_desc.values()),
                )
            )

            # Find the JSON keys present in the first annotation entry.
            # NOTE(review): the queries address the column as {annotation_format}
            # without the prefix, while the DataFrame column is
            # {annotation_format_infos} — verify behavior when a prefix is set.
            query_json = f"""SELECT distinct(unnest(json_keys({annotation_format}, '$.0'))) AS 'key' FROM dataframe_annotation_format;"""
            df_keys = self.get_query_to_df(query=query_json)

            # Build one typed SELECT clause per JSON key
            query_json_key = []
            for _, row in df_keys.iterrows():

                # Key
                key = row.iloc[0]

                # Cleaned key, usable as a column name
                key_clean = "".join(char for char in key if char.isalnum())

                # Extract all values of this key to infer its SQL type
                query_json_type = f"""SELECT unnest(json_extract_string({annotation_format}, '$.*."{key}"')) AS '{key_clean}' FROM dataframe_annotation_format WHERE trim('{key}') NOT IN ('');"""

                # Get DataFrame from query
                df_json_type = self.get_query_to_df(query=query_json_type)

                # Normalize missing values (None/"" -> NaN) and drop them so
                # type detection only sees real values
                with pd.option_context("future.no_silent_downcasting", True):
                    df_json_type.fillna(value="", inplace=True)
                    replace_dict = {None: np.nan, "": np.nan}
                    df_json_type.replace(replace_dict, inplace=True)
                    df_json_type.dropna(inplace=True)

                # Detect column type
                column_type = detect_column_type(df_json_type[key_clean])

                # Typed extraction clause; NULLIF maps empty strings to NULL
                query_json_key.append(
                    f"""NULLIF(unnest(json_extract_string({annotation_format}, '$.*."{key}"')), '')::{column_type} AS '{prefix}{key_clean}' """
                )

            # Create the temporary table, exposing the transcript identifier
            # column as 'transcript'
            query_view = f"""
                CREATE TEMPORARY TABLE {view_name}
                AS (
                    SELECT *, {annotation_id} AS 'transcript'
                    FROM (
                        SELECT "#CHROM", POS, REF, ALT, INFO, {",".join(query_json_key)}
                        FROM dataframe_annotation_format
                    )
                );
            """

            self.execute_query(query=query_view)

        else:

            # Annotation field not in header: nothing created
            view_name = None

        # Remove columns added to the variants table during processing
        for added_column in added_columns:
            self.drop_column(column=added_column)

        return view_name
The function annotation_format_to_table converts annotation data from a VCF file into a structured
table format.
Parameters
- `uniquify`: A boolean flag that determines whether to ensure unique values in the output. If set to `True`, the function makes sure the output values are unique. Defaults to True.
- `annotation_field`: The field in the VCF file that contains the annotation information for each variant, used to extract the annotation details for further processing. Defaults to "ANN".
- `annotation_id`: The identifier of the annotation feature, used as a column name in the resulting table or view to uniquely identify each annotation entry. Defaults to "Feature_ID".
- `view_name`: The name of the temporary table created to store the transformed annotation data in a structured format for further processing or analysis. Defaults to "transcripts".
Returns
The function `annotation_format_to_table` returns the name of the view created, which is stored in the variable `view_name`.
    def transcript_view_to_variants(
        self,
        transcripts_table: str = None,
        transcripts_column_id: str = None,
        transcripts_info_json: str = None,
        transcripts_info_field_json: str = None,
        transcripts_info_format: str = None,
        transcripts_info_field_format: str = None,
        param: dict = {},
    ) -> bool:
        """
        Annotate the variants table with information from the transcripts table.

        Transcript annotations can be written back in up to four ways:
        a JSON column (`transcripts_info_json`), a JSON INFO field appended to
        the INFO column (`transcripts_info_field_json`), a pipe-separated
        column (`transcripts_info_format`), and a pipe-separated INFO field
        (`transcripts_info_field_format`). Any parameter left to None is looked
        up in `param["transcripts"]`.

        :param transcripts_table: Name of the table containing the transcripts
            data; falls back to param, then to "transcripts"
        :type transcripts_table: str (optional)
        :param transcripts_column_id: Column of `transcripts_table` holding the
            transcript identifier; falls back to param, then to "transcript"
        :type transcripts_column_id: str (optional)
        :param transcripts_info_json: Name of the JSON column to create on the
            variants table
        :type transcripts_info_json: str (optional)
        :param transcripts_info_field_json: Name of the INFO field (declared in
            the VCF header) receiving the transcripts in JSON format
        :type transcripts_info_field_json: str (optional)
        :param transcripts_info_format: Name of the VARCHAR column to create on
            the variants table, in 'transcript|field|...' format
        :type transcripts_info_format: str (optional)
        :param transcripts_info_field_format: Name of the INFO field (declared
            in the VCF header) receiving the transcripts in
            'transcript|field|...' format
        :type transcripts_info_field_format: str (optional)
        :param param: Parameters dictionary; if empty, the object's parameters
            are used
        :type param: dict (optional)
        :return: True on success, False when none of the four output
            column/field parameters is provided
        """

        msg_info_prefix = "Start transcripts view to variants annotations"

        log.debug(f"{msg_info_prefix}...")

        # Default values for parameters not provided and absent from param
        transcripts_table_default = "transcripts"
        transcripts_column_id_default = "transcript"
        transcripts_info_json_default = None
        transcripts_info_format_default = None
        transcripts_info_field_json_default = None
        transcripts_info_field_format_default = None

        # Param
        if not param:
            param = self.get_param()

        # Transcripts table
        if transcripts_table is None:
            transcripts_table = param.get("transcripts", {}).get(
                "table", transcripts_table_default
            )

        # Transcripts column ID
        if transcripts_column_id is None:
            transcripts_column_id = param.get("transcripts", {}).get(
                "column_id", transcripts_column_id_default
            )

        # Transcripts info json
        if transcripts_info_json is None:
            transcripts_info_json = param.get("transcripts", {}).get(
                "transcripts_info_json", transcripts_info_json_default
            )

        # Transcripts info field JSON
        if transcripts_info_field_json is None:
            transcripts_info_field_json = param.get("transcripts", {}).get(
                "transcripts_info_field_json", transcripts_info_field_json_default
            )
        # NOTE(review): the commented-out fallback below suggests the JSON INFO
        # field was meant to work without transcripts_info_json; as written, the
        # UPDATE below would reference "t.None" in that case — confirm callers
        # always set transcripts_info_json alongside transcripts_info_field_json.
        # if transcripts_info_field_json is not None and transcripts_info_json is None:
        #     transcripts_info_json = transcripts_info_field_json

        # Transcripts info format
        if transcripts_info_format is None:
            transcripts_info_format = param.get("transcripts", {}).get(
                "transcripts_info_format", transcripts_info_format_default
            )

        # Transcripts info field FORMAT
        if transcripts_info_field_format is None:
            transcripts_info_field_format = param.get("transcripts", {}).get(
                "transcripts_info_field_format", transcripts_info_field_format_default
            )
        # if (
        #     transcripts_info_field_format is not None
        #     and transcripts_info_format is None
        # ):
        #     transcripts_info_format = transcripts_info_field_format

        # Variants table
        table_variants = self.get_table_variants()

        # Nothing to do when no output column/field is requested
        if (
            transcripts_info_json is None
            and transcripts_info_field_json is None
            and transcripts_info_format is None
            and transcripts_info_field_format is None
        ):
            return False

        # Annotation columns of the transcripts table (everything except the
        # variant key columns and the transcript identifier)
        query_transcripts_infos_columns = f"""
            SELECT *
            FROM (
                DESCRIBE SELECT * FROM {transcripts_table}
            )
            WHERE "column_name" NOT IN ('#CHROM', 'POS', 'REF', 'ALT', '{transcripts_column_id}')
        """
        transcripts_infos_columns = list(
            self.get_query_to_df(query=query_transcripts_infos_columns)["column_name"]
        )

        # SQL clause fragments: explode comma-separated values, JSON struct
        # entries, and pipe-format fields
        clause_select = []
        clause_to_json = []
        clause_to_format = []
        for field in transcripts_infos_columns:
            clause_select.append(
                f""" regexp_split_to_table("{field}", ',') AS '{field}' """
            )
            clause_to_json.append(f""" '{field}': "{field}" """)
            clause_to_format.append(f""" "{field}" """)

        # SET clauses for the two UPDATE statements
        update_set_json = []
        update_set_format = []

        # VCF header (INFO field declarations are added to it)
        vcf_reader = self.get_header()

        # Transcripts to info column in JSON
        if transcripts_info_json is not None:

            # Create column on variants table
            self.add_column(
                table_name=table_variants,
                column_name=transcripts_info_json,
                column_type="JSON",
                default_value=None,
                drop=False,
            )

            # Add header declaration.
            # NOTE(review): "unknwon" (sic) appears in source/version slots of
            # the header tuples throughout — kept byte-identical here.
            vcf_reader.infos[transcripts_info_json] = vcf.parser._Info(
                transcripts_info_json,
                ".",
                "String",
                "Transcripts in JSON format",
                "unknwon",
                "unknwon",
                self.code_type_map["String"],
            )

            # Add to update
            update_set_json.append(
                f""" {transcripts_info_json}=t.{transcripts_info_json} """
            )

        # Transcripts to info field in JSON
        if transcripts_info_field_json is not None:

            log.debug(f"{msg_info_prefix} - Annotation in JSON format...")

            # Append ';field=value' to INFO, skipping empty ('' or '.') parts
            update_set_json.append(
                f"""
                INFO = concat(
                    CASE
                        WHEN INFO NOT IN ('', '.')
                        THEN INFO
                        ELSE ''
                    END,
                    CASE
                        WHEN CAST(t.{transcripts_info_json} AS VARCHAR) NOT IN ('', '.')
                        THEN concat(
                            ';{transcripts_info_field_json}=',
                            t.{transcripts_info_json}
                        )
                        ELSE ''
                    END
                )
                """
            )

            # Add header declaration
            vcf_reader.infos[transcripts_info_field_json] = vcf.parser._Info(
                transcripts_info_field_json,
                ".",
                "String",
                "Transcripts in JSON format",
                "unknwon",
                "unknwon",
                self.code_type_map["String"],
            )

        if update_set_json:

            # Aggregate transcripts per variant into one JSON object keyed by
            # transcript id, then join back on the variant key columns
            query_update = f"""
                UPDATE {table_variants}
                SET {", ".join(update_set_json)}
                FROM
                    (
                        SELECT
                            "#CHROM", POS, REF, ALT,
                            concat(
                                '{{',
                                string_agg(
                                    '"' || "{transcripts_column_id}" || '":' ||
                                    to_json(json_output)
                                ),
                                '}}'
                            )::JSON AS {transcripts_info_json}
                        FROM
                            (
                                SELECT
                                    "#CHROM", POS, REF, ALT,
                                    "{transcripts_column_id}",
                                    to_json(
                                        {{{",".join(clause_to_json)}}}
                                    )::JSON AS json_output
                                FROM
                                    (SELECT "#CHROM", POS, REF, ALT, "{transcripts_column_id}", {", ".join(clause_select)} FROM {transcripts_table})
                                WHERE "{transcripts_column_id}" IS NOT NULL
                            )
                        GROUP BY "#CHROM", POS, REF, ALT
                    ) AS t
                WHERE {table_variants}."#CHROM" = t."#CHROM"
                    AND {table_variants}."POS" = t."POS"
                    AND {table_variants}."REF" = t."REF"
                    AND {table_variants}."ALT" = t."ALT"
            """

            self.execute_query(query=query_update)

        # Transcripts to info column in FORMAT
        if transcripts_info_format is not None:

            # Create column on variants table
            self.add_column(
                table_name=table_variants,
                column_name=transcripts_info_format,
                column_type="VARCHAR",
                default_value=None,
                drop=False,
            )

            # Add header declaration describing the pipe-separated layout
            vcf_reader.infos[transcripts_info_format] = vcf.parser._Info(
                transcripts_info_format,
                ".",
                "String",
                f"Transcripts annotations: 'transcript | {' | '.join(transcripts_infos_columns)}'",
                "unknwon",
                "unknwon",
                self.code_type_map["String"],
            )

            # Add to update
            update_set_format.append(
                f""" {transcripts_info_format}=t.{transcripts_info_format} """
            )

        # Transcripts to info field in FORMAT
        if transcripts_info_field_format is not None:

            log.debug(f"{msg_info_prefix} - Annotation in structured format...")

            # Append ';field=value' to INFO, skipping empty ('' or '.') parts
            update_set_format.append(
                f"""
                INFO = concat(
                    CASE
                        WHEN INFO NOT IN ('', '.')
                        THEN INFO
                        ELSE ''
                    END,
                    CASE
                        WHEN CAST(t.{transcripts_info_format} AS VARCHAR) NOT IN ('', '.')
                        THEN concat(
                            ';{transcripts_info_field_format}=',
                            t.{transcripts_info_format}
                        )
                        ELSE ''
                    END
                )
                """
            )

            # Add header declaration
            vcf_reader.infos[transcripts_info_field_format] = vcf.parser._Info(
                transcripts_info_field_format,
                ".",
                "String",
                f"Transcripts annotations: 'transcript | {' | '.join(transcripts_infos_columns)}'",
                "unknwon",
                "unknwon",
                self.code_type_map["String"],
            )

        if update_set_format:

            # Aggregate per-variant 'transcript|field|...' strings, then join
            # back on the variant key columns
            query_update = f"""
                UPDATE {table_variants}
                SET {", ".join(update_set_format)}
                FROM
                    (
                        SELECT
                            "#CHROM", POS, REF, ALT,
                            string_agg({transcripts_info_format}) AS {transcripts_info_format}
                        FROM
                            (
                                SELECT
                                    "#CHROM", POS, REF, ALT,
                                    "{transcripts_column_id}",
                                    concat(
                                        "{transcripts_column_id}",
                                        '|',
                                        {", '|', ".join(clause_to_format)}
                                    ) AS {transcripts_info_format}
                                FROM
                                    (SELECT "#CHROM", POS, REF, ALT, "{transcripts_column_id}", {", ".join(clause_select)} FROM {transcripts_table})
                            )
                        GROUP BY "#CHROM", POS, REF, ALT
                    ) AS t
                WHERE {table_variants}."#CHROM" = t."#CHROM"
                    AND {table_variants}."POS" = t."POS"
                    AND {table_variants}."REF" = t."REF"
                    AND {table_variants}."ALT" = t."ALT"
            """

            self.execute_query(query=query_update)

        return True
The `transcript_view_to_variants` function updates a variants table with transcript
annotations in JSON and/or structured (pipe-delimited) format.
Parameters
- transcripts_table: The `transcripts_table` parameter is used to specify the name of the table containing the transcripts data. If this parameter is not provided, the function will attempt to retrieve it from the `param` dictionary or use a default value of "transcripts".
- transcripts_column_id: The `transcripts_column_id` parameter is used to specify the column in the `transcripts_table` that contains the unique identifier for each transcript. This identifier is used to match transcripts with variants in the database.
- transcripts_info_json: The `transcripts_info_json` parameter is used to specify the name of the column in the variants table where the transcripts information will be stored in JSON format. This parameter allows you to define the column in the variants table that will hold the JSON-formatted information about transcripts.
- transcripts_info_field_json: The `transcripts_info_field_json` parameter is used to specify the field in the VCF header that will contain information about transcripts in JSON format. This field will be added to the VCF header as an INFO field with the specified name.
- transcripts_info_format: The `transcripts_info_format` parameter is used to specify the format of the information about transcripts that will be stored in the variants table. This format can be used to define how the transcript information will be structured or displayed within the variants table.
- transcripts_info_field_format: The `transcripts_info_field_format` parameter is used to specify the field in the VCF header that will contain information about transcripts in a specific format. This field will be added to the VCF header as an INFO field with the specified name.
- param: The `param` parameter in the `transcript_view_to_variants` method is a dictionary that contains various configuration settings related to transcripts. It is used to provide default values for certain parameters if they are not explicitly provided when calling the method. The `param` dictionary can be passed as an argument.
Returns
The function `transcript_view_to_variants` returns a boolean value. It returns `True`
if the operation is successful and `False` if certain conditions are not met.